diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..f81f3717a4ead833784b63da35185f2d07409983
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "src/operators/kernel/mali/ACL_Android"]
+	path = src/operators/kernel/mali/ACL_Android
+	url = https://github.com/halsay/ACL_Android.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8aded9a17e9c7ee91ef245aeae3b6ff24212c6f..6feabdbe4374c9200c4282f620fadc27f3128bc9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
 option(CPU "armv7 with neon" ON)
-option(MALI_GPU "mali gpu" OFF)
+option(MALI_GPU "mali gpu" ON)
 option(FPGA "fpga" OFF)
 set(DEBUGING ON)
 if (CPU)
@@ -15,7 +15,18 @@ if (CPU)
 endif()
 
 if (MALI_GPU)
-  add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+  add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+  add_definitions(-DUSE_ACL=1)
+  add_definitions(-DUSE_OPENCL)
+  set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
+  include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
 endif()
 
 if(FPGA)
diff --git a/src/framework/operator.h b/src/framework/operator.h
index c85a38d73c620ae4b08387b548bd2f4f8ca71711..c68744a676030413e81570ded0db5671cdf4ba7a 100644
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -138,9 +138,21 @@ class OpKernelBase {
    * @p para This parameter is a struct holding all the parameters the kernel
    * computation needs; all such structs live in: paddle-mobile/src/operators/op_param.h
    * */
+#ifdef PADDLE_MOBILE_MALI_GPU
+  OpKernelBase() { acl_op_ = nullptr; }
+  void *GetAclOp() const { return acl_op_; }
+  void SetAclOp(void *op, void *ob) const {
+    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
+  }
+#endif
   virtual void Compute(const P &para) const = 0;
   virtual bool Init(const P &para) const { return true; };
   virtual ~OpKernelBase() = default;
+
+ private:
+#ifdef PADDLE_MOBILE_MALI_GPU
+  void *acl_op_;
+#endif
 };
 
 #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp
index 672e990be44c11df0795b9c6f301803f8ad02285..5d94d54f88e33b168739b1bbdf9af0bea9fe1b4f 100644
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(batch_norm);
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index f5a9c3d81ef34ac9ff643dd174741e083c879cbc..fe0507dc812a3ddafcc0433c2659c3b49ea87f6e 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
 REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(concat);
+REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp
index
966bc9c1e77a4ae6e33bc830c06ba7593c7ba3e0..12c59da6452992e3dd73b985db685a651df02250 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add); REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(elementwise_add); +REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 034cf947871a962b786b66e3752d86f5a327f342..bd5fd8cb32d484b7f76652139603f6b0f1b4b5d7 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -50,6 +50,8 @@ USE_OP_CPU(feed); REGISTER_OPERATOR_CPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(feed); +REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index c28424f0d1880c9f7f44c6644a163215d639f7a3..4b3680b58357d8295b1b6acf111d3573d4e4d1bd 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -50,6 +50,8 @@ USE_OP_CPU(fetch); REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fetch); +REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp index 2605414c892f89787701334f428621d9d8c2520f..4c01603509b0a1d9da2c2dc31a38719d5117e05c 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add.cpp @@ -54,6 +54,8 @@ USE_OP_CPU(conv_add); REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv_add); +REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index f0a3ea17d9a86e2c8638c164cfa2bf21d4fb727d..24f1d3f63b3300db9b60a595466a0ced3b9e996b 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel< }; #ifdef PADDLE_MOBILE_CPU +#ifndef CONV_ADD_REGISTER static framework::FusionOpRegistrar convadd_registrar( new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER #endif +#endif + #ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + #endif + #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index b87f1c4110de6c525e4544d5a350b2beaf98af95..fd27005c8bef8f8cb91fbf5b6e5a852306c28a9b 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< }; #ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER // static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new // FusionConvAddReluOpMatcher()); +#endif + #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp index 2e4e098fd08e7a765a9f54eb6ed6a4dc579c359f..fae561348899dadc4c25f84ec3a0993d9ae693f9 100644 --- a/src/operators/fusion_fc_op.cpp +++ b/src/operators/fusion_fc_op.cpp @@ -59,6 +59,8 @@ USE_OP_CPU(fc); 
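// Note on the registration hunks above: each operator .cpp gains a
// USE_OP_MALI_GPU / REGISTER_OPERATOR_MALI_GPU pair so the op is visible to
// the MALI_GPU device. A minimal sketch of the static-registrar pattern such
// macros typically expand to; the registry, macro body, and names below are
// illustrative assumptions, not the project's actual macro definitions.
#include <functional>
#include <map>
#include <string>

struct OpRegistry {
  using Creator = std::function<void*()>;
  static std::map<std::string, Creator>& table() {
    static std::map<std::string, Creator> t;  // one table per device type
    return t;
  }
};

struct OpRegistrar {
  OpRegistrar(const std::string& type, OpRegistry::Creator c) {
    OpRegistry::table()[type] = std::move(c);  // runs before main()
  }
};

// REGISTER_OPERATOR_MALI_GPU(op_type, OpClass) would then boil down to
// defining one static registrar object per translation unit:
#define REGISTER_OPERATOR_MALI_GPU_SKETCH(op_type, OpClass) \
  static OpRegistrar reg_mali_##op_type(#op_type,           \
                                        [] { return (void*)new OpClass; })
// Because registration is a side effect of static construction, an op .cpp
// only needs the macro invocation and no explicit init call.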
REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fc); +REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 2035704bb60eb96bfb22fc4f277d30817efcf646..0ca4d2b27ad46b77ddba55b6b377e741c97bdc9e 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -66,11 +66,19 @@ class FusionFcOp }; #ifdef PADDLE_MOBILE_CPU +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); #endif +#endif + #ifdef PADDLE_MOBILE_MALI_GPU -// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER +static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); #endif +#endif + #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index 0f0ee95670f1911a83d39db7a0e784c50dc6c405..eaceaf78493dcf7e37768702ffeafa59febfec3a 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/kernel/batchnorm_kernel.h" -#include +#include "operators/kernel/central-arm-func/batchnorm_arm_func.h" namespace paddle_mobile { namespace operators { @@ -28,215 +28,7 @@ bool BatchNormKernel::Init(const BatchNormParam ¶) const { template <> void BatchNormKernel::Compute(const BatchNormParam ¶m) const { - const Tensor *input_x = param.InputX(); - auto input_x_ptr = input_x->data(); - const auto &x_dims = input_x->dims(); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - const int stride0 = C * H * W; - const int stride1 = H * W; - const int stride2 = W; - Tensor *out = param.OutputY(); - auto out_ptr = out->mutable_data(); - const float epsilon = param.Epsilon(); - const Tensor *mean = param.InputMean(); - const Tensor *variance = param.InputVariance(); - const Tensor *scale = param.InputScale(); - const Tensor *bias = param.InputBias(); - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - // Tensor inv_std; - // auto inv_std_ptr = inv_std.mutable_data(make_ddim({C})); - - PADDLE_MOBILE_ENFORCE(C == variance->numel(), - "C must equal to variance.numel()"); - - int HXW = H * W; - if (HXW > 32) { - int NXC = N * C; - float *inv_std_ptr = new float[NXC * 4]; - float *volatile new_scale_ptr = new float[NXC * 4]; - float *volatile new_bias_ptr = new float[NXC * 4]; - - /// std = (var + epsilon).sqrt(); - /// inv_std = 1 / std; - for (int i = 0; i < C * 4; i += 4) { - int index = i / 4; - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[index] + epsilon), 0.5)); - inv_std_ptr[i + 1] = inv_std_ptr[i]; - inv_std_ptr[i + 2] = inv_std_ptr[i]; - inv_std_ptr[i + 3] = inv_std_ptr[i]; - - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index]; - new_scale_ptr[i + 1] = new_scale_ptr[i]; - new_scale_ptr[i + 2] = new_scale_ptr[i]; - new_scale_ptr[i + 3] = new_scale_ptr[i]; - - new_bias_ptr[i] = - bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index]; - - new_bias_ptr[i + 1] = new_bias_ptr[i]; - new_bias_ptr[i + 2] = new_bias_ptr[i]; - new_bias_ptr[i + 3] = new_bias_ptr[i]; - } - - for (int j = C * 4; j < NXC * 4; ++j) { - new_scale_ptr[j] = new_scale_ptr[j - C * 
4]; - new_bias_ptr[j] = new_bias_ptr[j - C * 4]; - } - - asm volatile( - "subs %[N], %[N], #1 \n\t" - "blt end_n_%= \n\t" - "loop_n_%=: \n\t" - - "subs %[C], %[C], #1 \n\t" - "blt end_c_%= \n\t" - "loop_c_%=: \n\t" - - "vld1.32 {q9}, [%[new_scale_ptr]]! \n\t" - "vld1.32 {q10}, [%[new_bias_ptr]]! \n\t" - - "mov r6, %[HXW] \n\t" - - "subs r6, r6, #32 \n\t" - "blt end_hw_%= \n\t" - "loop_hw_%=: \n\t" - - "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" - - "vmul.f32 q1, q1, q9 \n\t" - "vmul.f32 q2, q2, q9 \n\t" - "vmul.f32 q3, q3, q9 \n\t" - "vmul.f32 q4, q4, q9 \n\t" - - "vmul.f32 q5, q5, q9 \n\t" - "vmul.f32 q6, q6, q9 \n\t" - "vmul.f32 q7, q7, q9 \n\t" - "vmul.f32 q8, q8, q9 \n\t" - - "vadd.f32 q1, q1, q10 \n\t" - "vadd.f32 q2, q2, q10 \n\t" - "vadd.f32 q3, q3, q10 \n\t" - "vadd.f32 q4, q4, q10 \n\t" - "vadd.f32 q5, q5, q10 \n\t" - "vadd.f32 q6, q6, q10 \n\t" - "vadd.f32 q7, q7, q10 \n\t" - "vadd.f32 q8, q8, q10 \n\t" - - "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" - "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" - "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" - "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" - - "subs r6, r6, #32 \n\t" - "bge loop_hw_%= \n\t" - "end_hw_%=: \n\t" - - "cmp r6, #0 \n\t" - "bge end_remainder_%= \n\t" - "mov r5, #4 \n\t" - "mul r6, r6, r5 \n\t" - "add %[input_x_ptr], %[input_x_ptr], r6 \n\t" - - "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" - "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" - - "vmul.f32 q1, q1, q9 \n\t" - "vmul.f32 q2, q2, q9 \n\t" - "vmul.f32 q3, q3, q9 \n\t" - "vmul.f32 q4, q4, q9 \n\t" - "vmul.f32 q5, q5, q9 \n\t" - "vmul.f32 q6, q6, q9 \n\t" - "vmul.f32 q7, q7, q9 \n\t" - "vmul.f32 q8, q8, q9 \n\t" - "vadd.f32 q1, q1, q10 \n\t" - "vadd.f32 q2, q2, q10 \n\t" - "vadd.f32 q3, q3, q10 \n\t" - "vadd.f32 q4, q4, q10 \n\t" - "vadd.f32 q5, q5, q10 \n\t" - "vadd.f32 q6, q6, q10 \n\t" - "vadd.f32 q7, q7, q10 \n\t" - "vadd.f32 q8, q8, q10 \n\t" - - "add %[out_ptr], %[out_ptr], r6 \n\t" - "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" - "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" - "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" - "vst1.32 {q7, q8}, [%[out_ptr]]! 
\n\t" - - "end_remainder_%=: \n\t" - - "subs %[C], %[C], #1 \n\t" - "bge loop_c_%= \n\t" - "end_c_%=: \n\t" - - "subs %[N], %[N], #1 \n\t" - "bge loop_n_%= \n\t" - "end_n_%=: \n\t" - : - : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr), - [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr), - [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "r5", "r6"); - - delete[] inv_std_ptr; - delete[] new_scale_ptr; - delete[] new_bias_ptr; - - } else { - float *inv_std_ptr = new float[C]; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - - Tensor new_scale; - auto new_scale_ptr = new_scale.mutable_data(make_ddim({C})); - Tensor new_bias; - auto new_bias_ptr = new_bias.mutable_data(make_ddim({C})); - - /// ((x - est_mean) * (inv_var) * scale + bias equal to - /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = - bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - { - for (int n = 0; n < N; n++) { - for (int h = 0; h < H; h++) { - int tmp_index = n * stride0 + i * stride1 + h * stride2; - for (int w = 0; w < W; w++) { - int index = tmp_index + w; - out_ptr[index] = - input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; - } - } - } - } - } - - delete[] inv_std_ptr; - // DLOG << "input[2,5,1,0](input[102]) ,channel 5 :"; - // DLOG << "input_x_ptr : " << input_x_ptr[102]; - // DLOG << "variance : " << variance_ptr[5]; - // DLOG << "inv_std_ptr : " << inv_std_ptr[5]; - // DLOG << "new_scale_ptr : " << new_scale_ptr[5]; - // DLOG << "new_bias_ptr : " << new_bias_ptr[5]; - // DLOG << "out_ptr : " << out_ptr[102]; - } + BatchnormCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index 2df48222e0923e403f2ad44b3d5c4a89aceb4cc4..d3c04179b37014adc6c81f32dd6c08f697283671 100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -15,6 +15,7 @@ limitations under the License. 
*/ #ifdef FUSION_CONVADD_RELU_OP #include "operators/kernel/conv_add_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h" namespace paddle_mobile { namespace operators { @@ -28,92 +29,7 @@ bool ConvAddReluKernel::Init( template <> void ConvAddReluKernel::Compute( const FusionConvAddReluParam ¶m) const { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - int axis = param.Axis(); - Tensor *output = param.Output(); - math::expand_bias(bias, axis, output->dims()); - output->ShareDataWith(bias); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), true); - } - } + ConvAddReluCompute(param); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index ce7c8b2bb3d596bc365eecd31ae4181f37be5e38..049425d88f96a322a0b4cb47c18d85f2df03d577 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ 
b/src/operators/kernel/arm/conv_kernel.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef CONV_OP #include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -26,88 +27,7 @@ bool ConvKernel::Init(const ConvParam ¶) const { template <> void ConvKernel::Compute(const ConvParam ¶m) const { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } - } + ConvCompute(param); } template class ConvKernel; diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 6cd16fda0dc6ea9928ab9fcfac5cd8b3c31a15a2..4cbfa23248e87e2bf3a8d97330fa19f92985a9d0 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -15,7 +15,7 @@ limitations 
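// The ConvCompute body being moved keeps the classic im2col + GEMM
// formulation: each group's input slice is unfolded into a
// [C/g * kh * kw, oh * ow] column matrix, and the filter, reshaped per group
// to [M/g, C/g * kh * kw], is multiplied against it. A naive im2col sketch
// for stride s, padding p, dilation d; illustrative only, not the project's
// math::Im2ColFunctor:
#include <vector>

std::vector<float> im2col_naive(const float* in, int C, int H, int W,
                                int kh, int kw, int s, int p, int d,
                                int* out_h, int* out_w) {
  int oh = (H + 2 * p - (d * (kh - 1) + 1)) / s + 1;
  int ow = (W + 2 * p - (d * (kw - 1) + 1)) / s + 1;
  std::vector<float> col(static_cast<size_t>(C) * kh * kw * oh * ow, 0.f);
  for (int c = 0; c < C; ++c)
    for (int i = 0; i < kh; ++i)
      for (int j = 0; j < kw; ++j) {
        int row = (c * kh + i) * kw + j;  // row index in the column matrix
        for (int y = 0; y < oh; ++y)
          for (int x = 0; x < ow; ++x) {
            int iy = y * s - p + i * d, ix = x * s - p + j * d;
            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
              col[row * oh * ow + y * ow + x] = in[(c * H + iy) * W + ix];
          }
      }
  *out_h = oh;
  *out_w = ow;
  return col;
}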
under the License. */ #ifdef DEPTHWISECONV_OP #include "operators/kernel/depthwise_conv_kernel.h" -#include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -27,91 +27,7 @@ bool DepthwiseConvKernel::Init(const ConvParam ¶) const { template <> void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { - LOG(kLOG_DEBUG) << param; - - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - // DLOG << " compute end get Attrs " << strides[0]; - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } - } + DepthwiseConvCompute(param); } template class DepthwiseConvKernel; diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h new file mode 100644 index 
0000000000000000000000000000000000000000..7f02d768b790b5f496ab0eac369fa3a4100ee733 --- /dev/null +++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BATCHNORM_OP + +#pragma once + +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void BatchnormCompute(const BatchNormParam ¶m) { + const Tensor *input_x = param.InputX(); + auto input_x_ptr = input_x->data(); + const auto &x_dims = input_x->dims(); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + const int stride0 = C * H * W; + const int stride1 = H * W; + const int stride2 = W; + Tensor *out = param.OutputY(); + auto out_ptr = out->mutable_data(); + const float epsilon = param.Epsilon(); + const Tensor *mean = param.InputMean(); + const Tensor *variance = param.InputVariance(); + const Tensor *scale = param.InputScale(); + const Tensor *bias = param.InputBias(); + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + // Tensor inv_std; + // auto inv_std_ptr = inv_std.mutable_data(make_ddim({C})); + + PADDLE_MOBILE_ENFORCE(C == variance->numel(), + "C must equal to variance.numel()"); + + int HXW = H * W; + if (HXW > 32) { + int NXC = N * C; + float *inv_std_ptr = new float[NXC * 4]; + float *volatile new_scale_ptr = new float[NXC * 4]; + float *volatile new_bias_ptr = new float[NXC * 4]; + + /// std = (var + epsilon).sqrt(); + /// inv_std = 1 / std; + for (int i = 0; i < C * 4; i += 4) { + int index = i / 4; + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[index] + epsilon), 0.5)); + inv_std_ptr[i + 1] = inv_std_ptr[i]; + inv_std_ptr[i + 2] = inv_std_ptr[i]; + inv_std_ptr[i + 3] = inv_std_ptr[i]; + + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index]; + new_scale_ptr[i + 1] = new_scale_ptr[i]; + new_scale_ptr[i + 2] = new_scale_ptr[i]; + new_scale_ptr[i + 3] = new_scale_ptr[i]; + + new_bias_ptr[i] = + bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index]; + + new_bias_ptr[i + 1] = new_bias_ptr[i]; + new_bias_ptr[i + 2] = new_bias_ptr[i]; + new_bias_ptr[i + 3] = new_bias_ptr[i]; + } + + for (int j = C * 4; j < NXC * 4; ++j) { + new_scale_ptr[j] = new_scale_ptr[j - C * 4]; + new_bias_ptr[j] = new_bias_ptr[j - C * 4]; + } + + asm volatile( + "subs %[N], %[N], #1 \n\t" + "blt end_n_%= \n\t" + "loop_n_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "blt end_c_%= \n\t" + "loop_c_%=: \n\t" + + "vld1.32 {q9}, [%[new_scale_ptr]]! \n\t" + "vld1.32 {q10}, [%[new_bias_ptr]]! \n\t" + + "mov r6, %[HXW] \n\t" + + "subs r6, r6, #32 \n\t" + "blt end_hw_%= \n\t" + "loop_hw_%=: \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! 
\n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" + + "subs r6, r6, #32 \n\t" + "bge loop_hw_%= \n\t" + "end_hw_%=: \n\t" + + "cmp r6, #0 \n\t" + "bge end_remainder_%= \n\t" + "mov r5, #4 \n\t" + "mul r6, r6, r5 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r6 \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "add %[out_ptr], %[out_ptr], r6 \n\t" + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" + + "end_remainder_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "bge loop_c_%= \n\t" + "end_c_%=: \n\t" + + "subs %[N], %[N], #1 \n\t" + "bge loop_n_%= \n\t" + "end_n_%=: \n\t" + : + : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr), + [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr), + [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "r5", "r6"); + + delete[] inv_std_ptr; + delete[] new_scale_ptr; + delete[] new_bias_ptr; + + } else { + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = + new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = + bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + + delete[] inv_std_ptr; + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6aadbab95c591d4286fdbb3c3f01a291cdd90429 --- /dev/null +++ 
b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_RELU_OP + +#pragma once +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + math::expand_bias(bias, axis, output->dims()); + output->ShareDataWith(bias); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g 
* out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), true); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..d08eebe5493bd9026073c3349631a42024579b95 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvCompute(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if 
(!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..e43e3664cb005bab4d3c5ec8b5b35bd6925c982d --- /dev/null +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DEPTHWISECONV_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void DepthwiseConvCompute(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + // DLOG << " compute end get Attrs " << strides[0]; + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + 
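// DepthwiseConvCompute reuses the grouped im2col + GEMM path unchanged; for
// a depthwise layer the groups attribute typically equals the channel count,
// so each group convolves exactly one input channel with its own filters.
// A naive reference under that assumption (stride s, padding p, one k x k
// filter per channel; names are illustrative):
void depthwise_conv_naive(const float* in, const float* w, float* out,
                          int C, int H, int W, int k, int s, int p,
                          int oh, int ow) {
  for (int c = 0; c < C; ++c)
    for (int y = 0; y < oh; ++y)
      for (int x = 0; x < ow; ++x) {
        float acc = 0.f;
        for (int i = 0; i < k; ++i)
          for (int j = 0; j < k; ++j) {
            int iy = y * s - p + i, ix = x * s - p + j;
            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
              acc += in[(c * H + iy) * W + ix] * w[(c * k + i) * k + j];
          }
        out[(c * oh + y) * ow + x] = acc;  // one output channel per input channel
      }
}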
// convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index 8efdc0bc067927d4a0f0d7fcef56d8a732e0b848..812ddd5a441f3a24c557546c1780248a557a6eb0 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase { bool Init(const ConvParam ¶) const; }; -inline bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, - const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/mali/ACL_Android b/src/operators/kernel/mali/ACL_Android new file mode 160000 index 0000000000000000000000000000000000000000..591027fcffea084100c756e48356e0f8a48e35e5 --- /dev/null +++ b/src/operators/kernel/mali/ACL_Android @@ -0,0 +1 @@ +Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5 diff --git a/src/operators/kernel/mali/acl_operator.cc b/src/operators/kernel/mali/acl_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..562d2fe1c46aa7a30b6418c7a3fcb21daafffa0f --- /dev/null +++ b/src/operators/kernel/mali/acl_operator.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
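// The inline IsExpand deleted from conv_kernel.h below is now consumed as
// math::IsExpand via the new "operators/math/conv_func.h" includes. Its
// logic is worth spelling out: im2col can be skipped only for a pure
// 1x1 / stride-1 / pad-0 / dilation-1 convolution, where the input already
// is the column matrix. Restated as a standalone function:
#include <cstdint>
#include <vector>

bool is_expand(const std::vector<int64_t>& filter_dim,
               const std::vector<int>& strides,
               const std::vector<int>& paddings,
               const std::vector<int>& dilations) {
  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
  for (size_t j = 0; j < strides.size(); ++j) {
    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
    strides_1 = strides_1 && (strides[j] == 1);
    padding_0 = padding_0 && (paddings[j] == 0);
    dilation_1 = dilation_1 && (dilations[j] == 1);
  }
  return !(filter_1 && strides_1 && padding_0 && dilation_1);
}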
*/ + +#if USE_ACL == 1 +#include "acl_operator.h" +unsigned int bypass_acl_class_layer = + (0 | FLAGS_ENABLE_ACL_CONCAT | + /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN + |*/ + 0); + +int enable_schedule = 0; + +#ifdef USE_PROFILING + +#include "arm_neon.h" + +unsigned int acl_log_flags = + (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */ + /*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */ + MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC | + MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID | + MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN | + MASK_LOG_CONCAT | 0); +#include /* printf */ +#include /* getenv */ +#endif // USE_PROFILING + +static bool force_enable_gpu = false; +bool AclEnableSchedule(int enable) { + enable_schedule = enable; + if (enable) { + force_enable_gpu = true; + } + return true; +} +int isScheduleEnable() { return enable_schedule; } + +namespace paddle_mobile { +namespace operators { +namespace acl { + +bool ACLOperator::init_gpu_env = true; +#ifdef USE_OPENCL +bool ACLOperator::support_opencl_ = false; +bool opencl_is_available() { return arm_compute::opencl_is_available(); } +#elif defined(USE_OPENGLES) +bool ACLOperator::support_opengles_ = false; +#endif +ACLOperator::ACLOperator(bool is_gpu) + : operator_state_(operator_not_init), + force_bypass_acl_path_(false), + target_hint_(TargetHint::DONT_CARE), + convolution_method_hint_(ConvolutionMethodHint::GEMM), + _group(1), + name_(""), + input_idx_(0), + output_idx_(0), + is_gpu_(is_gpu) { + const char* pBypassACL; + if (init_gpu_env) { +#ifdef USE_OPENCL + try { + if (opencl_is_available()) { + arm_compute::CLScheduler::get().default_init(); + support_opencl_ = true; + } + } catch (std::exception& e) { + support_opencl_ = false; + } +#elif defined(USE_OPENGLES) + try { + arm_compute::GCScheduler::get().default_init(); + support_opengles_ = true; + } catch (std::exception& e) { + support_opengles_ = false; + } +#endif + init_gpu_env = false; + } + if (force_enable_gpu) is_gpu_ = true; + pBypassACL = getenv("BYPASSACL"); + if (pBypassACL) { + unsigned int bacl; + sscanf(pBypassACL, "%i", &bacl); + if (bacl != bypass_acl_class_layer) { + bypass_acl_class_layer = bacl; + printf("BYPASSACL<%s>\n", pBypassACL); + printf("BYPASSACL: %x\n", bypass_acl_class_layer); + } + } + +#ifdef USE_PROFILING + const char* pLogACL; + pLogACL = getenv("LOGACL"); + if (pLogACL) { + unsigned int alf; + sscanf(pLogACL, "%i", &alf); + if (alf != acl_log_flags) { + acl_log_flags = alf; + printf("LOGACL<%s>\n", pLogACL); + printf("LOGACL: %x\n", acl_log_flags); + } + } +#endif // USE_PROFILING + const char* pEnableSchedule; + pEnableSchedule = getenv("ENABLESCHEDULE"); + if (pEnableSchedule) { + int bshedule; + sscanf(pEnableSchedule, "%i", &bshedule); + if (bshedule != enable_schedule) { + enable_schedule = bshedule; + printf("ENABLESCHEDULE<%s>\n", pEnableSchedule); + printf("ENABLESCHEDULE: %x\n", enable_schedule); + } + if (enable_schedule) { + AclEnableSchedule(1); + } + } +} +ACLOperator::~ACLOperator() {} + +bool ACLOperator::new_tensor(std::unique_ptr& tensor, + arm_compute::TensorShape& shape, void* mem, + bool commit) { + auto acl_tensor = + new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32)); + acl_tensor->set_target(getTargetHint()); + acl_tensor->bindmem(mem); + if (commit) acl_tensor->commit(); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} +bool 
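// The ACLOperator constructor above reads BYPASSACL, LOGACL and
// ENABLESCHEDULE from the environment with sscanf("%i", ...), so the bitmask
// can be supplied in hex at runtime without recompiling. A minimal
// standalone sketch of that pattern; the demo main and helper are
// illustrative, only the flag value mirrors FLAGS_ENABLE_ACL_CONCAT above:
#include <cstdio>
#include <cstdlib>

static unsigned int bypass_mask = 0;

void read_bypass_env() {
  if (const char* p = std::getenv("BYPASSACL")) {
    int v;
    if (std::sscanf(p, "%i", &v) == 1)   // "%i" accepts 0x-prefixed hex
      bypass_mask = static_cast<unsigned int>(v);
  }
}

int main() {
  read_bypass_env();  // e.g. BYPASSACL=0x1000 sets the concat bypass bit
  const unsigned int FLAGS_ENABLE_ACL_CONCAT = 0x00001000;
  if (bypass_mask & FLAGS_ENABLE_ACL_CONCAT)
    std::printf("concat will bypass the ACL path\n");
  return 0;
}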
ACLOperator::new_tensor(std::unique_ptr& tensor, + std::unique_ptr& parent, + arm_compute::TensorShape& shape, + arm_compute::Coordinates& coord) { + auto acl_tensor = new ACLSubTensor(parent, shape, coord); + acl_tensor->set_target(getTargetHint()); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} + +void ACLTensor::commit(TensorType type) { + settensortype(type); + if (mem_) { + if (!allocate_) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_ALLOCATE_INFO); +#endif // USE_PROFILING + allocate(); + allocate_ = true; + } + if (type_ != tensor_output) { + tensor_copy(mem_); + } + mem_ = nullptr; + } +} + +int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem, + bool toTensor) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_COPY_INFO); +#endif // USE_PROFILING + arm_compute::Window window; + // Iterate through the rows (not each element) + window.use_tensor_dimensions(tensor->info()->tensor_shape(), + /* first_dimension =*/arm_compute::Window::DimY); + + int width = tensor->info()->tensor_shape()[0]; + int height = tensor->info()->tensor_shape()[1]; + int deepth = tensor->info()->tensor_shape()[2]; + map(); + // Create an iterator: + arm_compute::Iterator it(tensor, window); + // Except it works for an arbitrary number of dimensions + if (toTensor) { // mem->tensor + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(it.ptr(), + ((char*)mem) + + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width + id.x()) * + tensor->info()->element_size()), + width * tensor->info()->element_size()); + }, + it); + } else { // tensor-->mem + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width) * + tensor->info()->element_size()), + it.ptr(), width * tensor->info()->element_size()); + }, + it); + } + unmap(); + + return 0; +} + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/mali/acl_operator.h b/src/operators/kernel/mali/acl_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e13283b1c679d6dfc8972af5ace5e579d568e6 --- /dev/null +++ b/src/operators/kernel/mali/acl_operator.h @@ -0,0 +1,1144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
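// BaseACLTensor::tensor_copy above iterates the ACL window row by row
// (Window::DimY) and memcpy's one W-sized row at a time between the ITensor
// and a linear NCHW host buffer. The byte offset it computes for a window
// position (n, z, y, x) is just the NCHW index scaled by element size;
// isolated here for clarity (helper is illustrative, not part of the diff):
#include <cstddef>

size_t nchw_byte_offset(int n, int z, int y, int x,
                        int width, int height, int depth, size_t elem_size) {
  // matches: (id[3]*(W*H*D) + id.z()*(W*H) + id.y()*W + id.x()) * element_size
  return (static_cast<size_t>(n) * width * height * depth +
          static_cast<size_t>(z) * width * height +
          static_cast<size_t>(y) * width + x) *
         elem_size;
}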
*/ + +#ifndef ACL_OPERATOR_H_ +#define ACL_OPERATOR_H_ +#include +#include + +#if USE_ACL == 1 +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#ifdef PADDLE_MOBILE_MALI_GPU +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#endif + +#ifdef USE_OPENGLES +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" +#endif + +#include "acl_tensor.h" +#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 +#define FLAGS_ENABLE_ACL_BNLL 0x00000002 +#define FLAGS_ENABLE_ACL_CONV 0x00000004 +#define FLAGS_ENABLE_ACL_FC 0x00000008 +#define FLAGS_ENABLE_ACL_LRN 0x00000010 +#define FLAGS_ENABLE_ACL_POOLING 0x00000020 +#define FLAGS_ENABLE_ACL_RELU 0x00000040 +#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 +#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 +#define FLAGS_ENABLE_ACL_TANH 0x00000200 +#define FLAGS_ENABLE_ACL_LC 0x00000400 +#define FLAGS_ENABLE_ACL_BN 0x00000800 +#define FLAGS_ENABLE_ACL_CONCAT 0x00001000 +extern unsigned int bypass_acl_class_layer; + +#ifdef USE_PROFILING +#include +#define NANO_SEC_CONV 1000000 + +#define MASK_LOG_APP_TIME 0x00000001 +#define MASK_LOG_ALLOCATE 0x00000002 +#define MASK_LOG_RUN 0x00000004 +#define MASK_LOG_CONFIG 0x00000008 +#define MASK_LOG_COPY 0x00000010 +#define MASK_LOG_ABSVAL 0x00000020 +#define MASK_LOG_BNLL 0x00000040 +#define MASK_LOG_CONV 0x00000080 +#define MASK_LOG_FC 0x00000100 +#define 
MASK_LOG_LRN 0x00000200 +#define MASK_LOG_POOLING 0x00000400 +#define MASK_LOG_RELU 0x00000800 +#define MASK_LOG_SIGMOID 0x00001000 +#define MASK_LOG_SOFTMAX 0x00002000 +#define MASK_LOG_TANH 0x00004000 +#define MASK_LOG_LC 0x00008000 +#define MASK_LOG_BN 0x00010000 +#define MASK_LOG_CONCAT 0x00020000 +#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t" +#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t" +#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" +#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" +#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" +#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" +#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" +#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" +#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" +#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" +#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SIGMOID_INFO \ + MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SOFTMAX_INFO \ + MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_TANH_INFO \ + MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_BN_INFO \ + MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_CONCAT_INFO \ + MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +extern unsigned int acl_log_flags; + +class logtime_util { + public: + logtime_util() { mask = 0; } + logtime_util(int mask_, const char *information_) { + setlogtime_info(mask_, information_); + } + void setlogtime_info(int mask_, const char *information_) { + mask = mask_; + if (acl_log_flags & mask) { + strncpy(information, information_, 255); + gettimeofday(&tv[0], NULL); + } + } + ~logtime_util() { + if (acl_log_flags & mask) { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + void log_time(bool start) { + if (acl_log_flags & mask) { + if (start) { + gettimeofday(&tv[0], NULL); + } else { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + } + + private: + struct timeval tv[2]; + int mask; + char information[256]; +}; + +#endif // USE_PROFILING + +namespace paddle_mobile { +namespace operators { +namespace acl { + +class AclParameters { + public: + AclParameters() { + dilated = false; + dim = 2; + num_group = 1; + } + int batch; + int in_depth; + int in_rows; + int in_cols; + + int out_depth; + int out_rows; + int out_cols; + int out_num; + + int filter_rows; + int filter_cols; + + int stride_rows; + int stride_cols; + + int pad_rows; + int pad_cols; + + int dilation_rows; + int dilation_cols; + + int num_group; + bool dilated; + int dim; + int epsilon; + + int nsize; + float alpha; + float beta; + float knorm; + + void *input_data; + void *output_data; + void *weight_data; + void *biases_data; + void *mean_data; + void *var_data; + + std::string pool_type; + std::string act_type; + std::string 
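// logtime_util above is an RAII timer: the constructor stores a mask and a
// label and records gettimeofday(), and the destructor prints the elapsed
// seconds whenever that mask bit is set in acl_log_flags. A usage sketch
// matching the #ifdef USE_PROFILING sites in acl_operator.cc; assumes this
// header is included and USE_PROFILING is defined (timed_work is
// illustrative):
void timed_work() {
#ifdef USE_PROFILING
  logtime_util log_time(ACL_RUN_INFO);  // clock starts here
#endif
  // ... section being measured; the elapsed time is printed when log_time
  // goes out of scope at the end of this function ...
}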
data_layout; + + bool is_global_pool; + bool is_channel_concat; + + std::vector in_tensor; +}; + +enum TensorType { + tensor_input, + tensor_output, + tensor_weights, + tensor_biases, + tensor_mean, + tensor_var, + tensor_beta, + tensor_gamma, + tensor_concat, + tensor_data, +}; +enum OperatorState { + operator_not_init, + operator_init_done, + operator_reinit, +}; +enum OperateType { + operate_type_pooling, + operate_type_activation, + operate_type_lrn, + operate_type_conv, + operate_type_lc, + operate_type_fc, + operate_type_bn, + operate_type_softmax, + operate_type_concat, +}; + +class BaseACLTensor { + public: + BaseACLTensor() : type_(tensor_input), allocate_(false) {} + virtual ~BaseACLTensor() {} + virtual void bindmem(void *mem) { mem_ = mem; } + virtual void settensortype(TensorType type) { type_ = type; } + virtual void map(bool blocking = true) {} + virtual void unmap() {} + virtual void commit(TensorType type = tensor_data) {} + int tensor_copy(arm_compute::ITensor *tensor, void *mem, + bool toTensor = true); + + protected: + void *mem_; + TensorType type_; + bool allocate_; +}; +class ACLTensor : public BaseACLTensor, public Tensor { + public: + explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {} + virtual void map(bool blocking = true) { + if (!allocate_) { + Tensor::allocate(); + allocate_ = true; + } + Tensor::map(blocking); + } + virtual int tensor_copy(void *mem, bool toTensor = true) { + auto acl_tensor = this; + arm_compute::ITensor *tensor = acl_tensor->tensor(); + BaseACLTensor::tensor_copy(tensor, mem, toTensor); + return 0; + } + virtual void unmap() { Tensor::unmap(); } + virtual void commit(TensorType type = tensor_data); +}; +class ACLSubTensor : public BaseACLTensor, public SubTensor { + public: + ACLSubTensor(std::unique_ptr &parent, + arm_compute::TensorShape &shape, arm_compute::Coordinates &coord) + : SubTensor(parent.get(), shape, coord) {} + virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; } +}; + +template +class TensorPair { + public: + TensorPair() {} + ~TensorPair() {} + TensorType type; + std::unique_ptr tensor; +}; +template +std::unique_ptr &tensor_item( + std::vector>> &pool, TensorType type, + int idx) { + int count = 0; + for (auto &item : pool) { + if (item.get()->type == type) { + ++count; + } + if (item.get()->type == type && idx == count - 1) { + return item.get()->tensor; + } + } + pool.push_back((std::unique_ptr>)std::move(new TensorPair)); + auto item = pool[pool.size() - 1].get(); + item->type = type; + item->tensor = NULL; + return item->tensor; +} +class ACLOperator { + public: + virtual void commit() { + for (auto &item : tensor_pool_) { + if (item.get()->tensor) item.get()->tensor->commit(item.get()->type); + } + } + inline void run() { + commit(); +#ifdef USE_PROFILING + logtime_util log_time(ACL_RUN_INFO); +#endif // USE_PROFILING + for (auto &c : funcs_) { + c->run(); + } + } + + inline std::vector> &funcs() { + return funcs_; + } + inline std::unique_ptr &sinput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &soutput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &sweights(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &sbiases(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &cinput(int idx = 0) { + return tensor_item(tensor_pool_, tensor_concat, idx); + } + inline 
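`tensor_item()` implements a find-or-create lookup over the typed tensor pool, which is what lets the accessors (`input()`, `weights()`, `biases()`, ...) below always return a bindable slot. A self-contained toy model of the same semantics (simplified types, not the real `TensorPair`):

```cpp
#include <memory>
#include <vector>

struct Slot {
  int type;
  std::unique_ptr<int> tensor;
};

// Return the idx-th entry of the requested type; append an empty slot if
// none exists, mirroring tensor_item()'s find-or-create behavior.
std::unique_ptr<int> &slot_item(std::vector<Slot> &pool, int type, int idx) {
  int count = 0;
  for (auto &s : pool)
    if (s.type == type && idx == count++) return s.tensor;  // idx-th match
  pool.push_back(Slot{type, nullptr});
  return pool.back().tensor;
}

int main() {
  std::vector<Slot> pool;
  slot_item(pool, /*type=*/1, /*idx=*/0) = std::make_unique<int>(42);
  return *slot_item(pool, 1, 0) == 42 ? 0 : 1;  // finds the same slot again
}
```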
std::unique_ptr &input(int idx = 0) { + return tensor_item(tensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &output(int idx = 0) { + return tensor_item(tensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &weights(int idx = 0) { + return tensor_item(tensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &biases(int idx = 0) { + return tensor_item(tensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &mean(int idx = 0) { + return tensor_item(tensor_pool_, tensor_mean, idx); + } + inline std::unique_ptr &var(int idx = 0) { + return tensor_item(tensor_pool_, tensor_var, idx); + } + inline std::unique_ptr &beta(int idx = 0) { + return tensor_item(tensor_pool_, tensor_beta, idx); + } + inline std::unique_ptr &gamma(int idx = 0) { + return tensor_item(tensor_pool_, tensor_gamma, idx); + } + inline std::unique_ptr &tensor(TensorType type) { + switch (type) { + case tensor_biases: + return biases(); + break; + case tensor_weights: + return weights(); + break; + case tensor_output: + return output(); + break; + default: + case tensor_input: + return input(); + break; + } + return input(); + } + + explicit ACLOperator(bool is_gpu = false); + virtual ~ACLOperator(); + inline TargetHint getTargetHint() { +#ifdef USE_OPENCL + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENCL; + } + return TargetHint::NEON; + } + return target_hint_; +#elif defined(USE_OPENGLES) + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENGLES; + } + return TargetHint::NEON; + } + return target_hint_; +#else + return TargetHint::NEON; +#endif + } + inline void setTargetHint(TargetHint hint) { target_hint_ = hint; } + inline ConvolutionMethodHint &getConvMethod() { + return convolution_method_hint_; + } + inline void setConvMethod() { + convolution_method_hint_ = ConvolutionMethodHint::DIRECT; + } + inline bool tensor_mem(std::unique_ptr &tensor, void *mem) { + tensor->bindmem(mem); + return true; + } + inline bool tensor_mem(void *mem, std::unique_ptr &tensor) { + tensor->tensor_copy(mem, false); + return true; + } + bool new_tensor(std::unique_ptr &tensor, + arm_compute::TensorShape &shape, void *mem = nullptr, + bool commit = false); + bool new_tensor(std::unique_ptr &tensor, + std::unique_ptr &parent, + arm_compute::TensorShape &shape, + arm_compute::Coordinates &coord); + inline int &group() { return _group; } + inline void set_operator_property(OperateType type, const char *name) { + name_ = name; + type_ = type; + } + inline void acl_run(void *input_data, void *output_data) { + if (input_data) tensor_mem(input(), input_data); + run(); + tensor_mem(output_data, output()); + } + inline int &input_idx() { return input_idx_; } + inline int &output_idx() { return output_idx_; } + + protected: + inline bool isGPUMode() { +#ifdef USE_OPENCL + if (!support_opencl_) return false; + return getTargetHint() == TargetHint::OPENCL; +#elif defined(USE_OPENGLES) + if (!support_opengles_) return false; + return getTargetHint() == TargetHint::OPENGLES; +#endif + return false; + } + inline OperatorState &opstate() { return operator_state_; } + inline bool is_operator_init_done(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + checkreshape(shape, type); + return operator_state_ == operator_init_done; + } + inline void set_operator_init_done() { + opstate() = operator_init_done; + set_bypass_state(false); + } + inline void set_bypass_state(bool state = false) { + force_bypass_acl_path_ = state; + } + 
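Taken together, `is_operator_init_done()`, `set_operator_init_done()`, and `set_bypass_state()` define the configure-once / run-many lifecycle every Mali kernel in this patch follows: rebuild the ACL function list only when the input geometry changes, otherwise replay the cached functions. A self-contained toy model of that contract (not the real class):

```cpp
#include <functional>
#include <vector>

struct ToyOp {
  std::vector<int> cached_shape;
  std::vector<std::function<void()>> funcs;

  void init(const std::vector<int> &shape) {
    if (shape == cached_shape && !funcs.empty()) return;  // "init done" fast path
    funcs.clear();                                        // models freeres()
    cached_shape = shape;
    funcs.push_back([] { /* a configured ACL layer would go here */ });
  }
  void run() {
    for (auto &f : funcs) f();  // ACLOperator::run() iterates funcs_ the same way
  }
};

int main() {
  ToyOp op;
  op.init({1, 3, 224, 224});
  op.run();
  op.init({1, 3, 224, 224});  // same shape: no reconfigure
  op.run();
}
```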
inline OperatorState checkreshape(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + opstate() = reshape(shape, type); + if (opstate() == operator_reinit) { + freeres(); + } + return opstate(); + } + inline OperatorState reshape(arm_compute::TensorShape &shape, + TensorType type) { + arm_compute::TensorShape _shape; + std::unique_ptr &acl_tensor = tensor(type); + if (!acl_tensor.get()) return operator_not_init; + _shape = acl_tensor->info().tensor_shape(); + if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] && + _shape[1] == shape[1]) { + return operator_init_done; + } + return operator_reinit; + } + inline void freeres() { + tensor_pool_.clear(); + subtensor_pool_.clear(); + funcs_.clear(); + } + inline const char *&name() { return name_; } + inline void set_in_out_index(int indata_idx, int outdata_idx) { + input_idx() = indata_idx; + output_idx() = outdata_idx; + } + + protected: + std::vector>> tensor_pool_; + std::vector>> subtensor_pool_; + std::vector> funcs_; + OperatorState operator_state_; + bool force_bypass_acl_path_; + TargetHint target_hint_; + ConvolutionMethodHint convolution_method_hint_; + static bool support_opengles_; + static bool support_opencl_; + static bool init_gpu_env; + int _group; + const char *name_; + OperateType type_; + int input_idx_, output_idx_; + bool is_gpu_; +}; + +int isScheduleEnable(); + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + return instantiate_function(input, output); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + VectorTensor inputs, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(inputs, dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + VectorTensor inputs, arm_compute::ITensor *output) { + return instantiate_function(inputs, + output); +} + +template +std::unique_ptr instantiate_op_func_lists( + ACLOperator *&acl_op, std::unique_ptr &output, int num, + TargetHint &hint) { + std::unique_ptr func; + static std::vector tensors; + tensors.clear(); + for (int i = 0; i < num; ++i) { + tensors.push_back( + dynamic_cast(acl_op->cinput(i).get()->tensor())); + } + func = instantiate>( + tensors, output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output), info); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), + output->tensor(), info); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor 
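The reinit decision in `reshape()` is deliberately coarse: two shapes count as "the same operator" when their element counts and first two dimensions match; anything else triggers `freeres()` and a full reconfigure. Isolated as a predicate (links against ACL; a sketch, not part of the patch):

```cpp
#include "arm_compute/core/TensorShape.h"

// True when ACLOperator must tear down and reconfigure for the new shape.
bool needs_reinit(const arm_compute::TensorShape &cur,
                  const arm_compute::TensorShape &next) {
  return !(cur.total_size() == next.total_size() && cur[0] == next[0] &&
           cur[1] == next[1]);
}
```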
*input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(weights), + dynamic_cast(biases), + dynamic_cast(output), info); + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, weights, biases, output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &weights, + std::unique_ptr &biases, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + arm_compute::ITensor *biases_tensor = NULL; + + if (biases.get()) { + biases_tensor = biases->tensor(); + } + func = instantiate( + input->tensor(), weights->tensor(), biases_tensor, output->tensor(), + info); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor *var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) { + auto op = cpp14::make_unique(); + op->configure( + dynamic_cast(input), dynamic_cast(output), + dynamic_cast(mean), dynamic_cast(var), + dynamic_cast(beta), dynamic_cast(gamma), eps); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor *var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) { + return instantiate_function( + input, output, mean, var, beta, gamma, eps); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + std::unique_ptr &mean, std::unique_ptr &var, + std::unique_ptr &beta, std::unique_ptr &gamma, + Dtype eps, TargetHint hint) { + std::unique_ptr func; + func = instantiate( + input->tensor(), output->tensor(), mean->tensor(), var->tensor(), + beta->tensor(), gamma->tensor(), eps); + return func; +} + +template +bool instantiate_op_pooling( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + } + return true; +} +template +bool instantiate_op_activation( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_lrn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const 
OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_conv( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); + ConvolutionMethodHint &conv_method = acl_op->getConvMethod(); + bool has_biases = biases.get() ? true : false; + int &groups = acl_op->group(); + arm_compute::TensorShape input_shape = input->info().tensor_shape(); + arm_compute::TensorShape weights_shape = weights->info().tensor_shape(); + arm_compute::TensorShape biases_shape; + if (has_biases) { + biases_shape = biases->info().tensor_shape(); + } + arm_compute::TensorShape output_shape = output->info().tensor_shape(); + + if (groups == 1) { + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } + return true; + } + + // Calculate sub-tensor splits + const int input_split = input_shape.z() / groups; + const int output_split = output_shape.z() / groups; + const int weights_split = weights_shape[3] / groups; + const int biases_split = biases_shape.x() / groups; + + // Calculate sub-tensor shapes + input_shape.set(2, input_split); + output_shape.set(2, output_split); + weights_shape.set(3, weights_split); + biases_shape.set(0, biases_split); + + for (auto i = 0; i < groups; ++i) { + // Calculate sub-tensors starting coordinates + arm_compute::Coordinates input_coord(0, 0, input_split * i); + arm_compute::Coordinates output_coord(0, 0, output_split * i); + arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i); + arm_compute::Coordinates biases_coord(biases_split * i); + + // Create sub-tensors for input, output, weights and bias + acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, + input_coord); + acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape, + output_coord); + acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape, + weights_coord); + if (has_biases) { + 
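The grouped-convolution path splits every tensor along its channel-style axis before instantiating one convolution per group. A worked example of the split arithmetic with illustrative numbers (input `[W, H, IFM, N]`, weights `[k, k, IFM/groups, OFM]`):

```cpp
#include <cstdio>

int main() {
  const int groups = 2, ifm = 64, ofm = 128;
  const int input_split = ifm / groups;    // 32 input channels per group
  const int output_split = ofm / groups;   // 64 output channels per group
  const int weights_split = ofm / groups;  // 64 filters per group
  for (int i = 0; i < groups; ++i) {
    // These offsets mirror the Coordinates built in the loop above.
    std::printf("group %d: input z=%d, output z=%d, weights dim3=%d\n", i,
                input_split * i, output_split * i, weights_split * i);
  }
}
```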
acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape, + biases_coord); + } + + bool use_opencl = false; + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } + } + } + return true; +} +template +bool instantiate_op_lc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_fc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_bn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, Dtype eps) { + std::unique_ptr &mean = acl_op->mean(); + std::unique_ptr &var = acl_op->var(); + std::unique_ptr &beta = acl_op->beta(); + std::unique_ptr &gamma = acl_op->gamma(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + } + return true; +} +inline bool instantiate_op_softmax( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, void *data) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#endif + { + 
func.push_back( + instantiate_op_func( + input, output, hint)); + } + return true; +} +inline bool instantiate_op_concat( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, int num) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + } + return true; +} +template +void *InputdataPtr(ACLOperator *op, + const std::vector &input_data, + Dtype type, int index = -1) { + if (index == -1) index = 0; + return (void *)(input_data[index]->mutable_data()); +} + +template +void acl_run(ACLOperator *op, + const std::vector &in_data, void *out_data, + Dtype type, bool multi_input_run = true) { + for (int i = 0; i < in_data.size(); ++i) { + op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i)); + } + op->acl_run(NULL, out_data); +} +} // namespace acl +} // namespace operators +} // namespace paddle_mobile + +#ifdef USE_PROFILING +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + logtime_util log_time(ACL_CONFIG_INFO); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#else +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#endif + +#define ACLOp_Ptr(a) dynamic_cast(a) + +#endif // USE_ACL + +#endif // ACL_OPERATOR_H_ diff --git a/src/operators/kernel/mali/acl_tensor.cc b/src/operators/kernel/mali/acl_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..97a6add20a7ca1b9a6b4f9c9a7e6d1ba1f4e2e0a --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
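`acl_configure` is the glue between the kernels and the `instantiate_op_*` family: it stamps the operator's type and name, then builds the ACL function list in one call. For instance, `acl_configure(conv, this, conv_info)` in the convolution kernels below expands (in a non-profiling build) to roughly:

```cpp
{
  set_operator_property(acl::operate_type_conv, "conv");
  instantiate_op_conv(this, this->funcs(), this->input(), this->output(),
                      this->getTargetHint(), conv_info);
}
```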
*/ + +#include "acl_tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace acl { + +#ifdef USE_ACL +template +std::unique_ptr initialise_tensor( + arm_compute::TensorInfo &info) { + auto tensor = cpp14::make_unique(); + tensor->allocator()->init(info); + return std::move(tensor); +} + +template +void tensor_allocate(arm_compute::ITensor &tensor) { + auto itensor = dynamic_cast(&tensor); + itensor->allocator()->allocate(); +} + +Tensor::Tensor(arm_compute::TensorInfo &info) noexcept + : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {} + +Tensor::Tensor(Tensor &&src) noexcept + : _target(src._target), + _info(std::move(src._info)), + _tensor(std::move(src._tensor)) {} + +arm_compute::ITensor *Tensor::set_target(TargetHint target) { + switch (target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _tensor = initialise_tensor(_info); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + _tensor = initialise_tensor(_info); + break; +#endif + case TargetHint::NEON: + _tensor = initialise_tensor(_info); + break; + default: + break; + } + _target = target; + return _tensor.get(); +} + +void Tensor::allocate() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + tensor_allocate(*_tensor); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + tensor_allocate(*_tensor); + break; +#endif + case TargetHint::NEON: + tensor_allocate(*_tensor); + break; + default: + break; + } +} +void Tensor::map(bool blocking) { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->map(blocking); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->map(blocking); +#endif +} +void Tensor::unmap() { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->unmap(); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->unmap(); +#endif +} + +template +std::unique_ptr initialise_subtensor( + arm_compute::ITensor *parent, arm_compute::TensorShape shape, + arm_compute::Coordinates coords) { + auto ptensor = dynamic_cast(parent); + auto subtensor = cpp14::make_unique(ptensor, shape, coords); + return std::move(subtensor); +} +SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept + : _target(TargetHint::DONT_CARE), + _tensor_shape(tensor_shape), + _coords(coords), + _parent(nullptr), + _subtensor(nullptr) { + _parent = parent->tensor(); + _target = parent->target(); + + instantiate_subtensor(); +} +arm_compute::ITensor *SubTensor::set_target(TargetHint target) { + return (target == _target) ? 
_subtensor.get() : nullptr; +} + +arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); } + +const arm_compute::ITensor *SubTensor::tensor() const { + return _subtensor.get(); +} + +TargetHint SubTensor::target() const { return _target; } + +void SubTensor::allocate() { + // NOP for sub-tensors +} + +void SubTensor::instantiate_subtensor() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _subtensor = initialise_subtensor( + _parent, _tensor_shape, _coords); + break; +#endif + default: + case TargetHint::NEON: + _subtensor = + initialise_subtensor( + _parent, _tensor_shape, _coords); + break; + } +} + +#endif + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/mali/acl_tensor.h b/src/operators/kernel/mali/acl_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..1d4f59371e355ddd2e89a709eec0b5451c1c3502 --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef ACL_TENSOR_H_ +#define ACL_TENSOR_H_ + +#ifdef USE_ACL +#ifdef USE_OPENCL +#include "arm_compute/runtime/CL/CLSubTensor.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#elif defined(USE_OPENGLES) +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#endif +#include "arm_compute/runtime/SubTensor.h" +#include "arm_compute/runtime/Tensor.h" + +#include + +namespace paddle_mobile { +namespace operators { +namespace acl { +enum class TargetHint { + DONT_CARE, + OPENCL, + OPENGLES, + NEON, +}; + +enum class ConvolutionMethodHint { + GEMM, + DIRECT, +}; +namespace cpp14 { +template +struct _Unique_if { + typedef std::unique_ptr _Single_object; +}; + +template +struct _Unique_if { + typedef std::unique_ptr _Unknown_bound; +}; + +template +struct _Unique_if { + typedef void _Known_bound; +}; + +template +typename _Unique_if::_Single_object make_unique(Args &&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +typename _Unique_if::_Unknown_bound make_unique(size_t n) { + typedef typename std::remove_extent::type U; + return std::unique_ptr(new U[n]()); +} + +template +typename _Unique_if::_Known_bound make_unique(Args &&...); +} // namespace cpp14 + +class Tensor { + public: + explicit Tensor(arm_compute::TensorInfo &info) noexcept; + virtual ~Tensor() {} + Tensor(Tensor &&src) noexcept; + void set_info(arm_compute::TensorInfo &&info) { _info = info; } + arm_compute::ITensor *set_target(TargetHint target); + const arm_compute::TensorInfo &info() const { return _info; } + arm_compute::ITensor *tensor() { return _tensor.get(); } + void allocate(); + void init() {} + TargetHint target() const { return _target; } + virtual void map(bool blocking = true); + virtual void unmap(); + + private: + TargetHint _target; + arm_compute::TensorInfo _info; + std::unique_ptr _tensor; +}; + +class SubTensor { + public: + SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept; + ~SubTensor() {} + arm_compute::ITensor *tensor(); + const arm_compute::ITensor *tensor() const; + TargetHint target() const; + void allocate(); + arm_compute::ITensor *set_target(TargetHint target); + + private: + /** Instantiates a sub-tensor */ + void instantiate_subtensor(); + + private: + /**< Target that this tensor is pinned on */ + TargetHint _target; + /**< SubTensor shape */ + arm_compute::TensorShape _tensor_shape; + /**< SubTensor Coordinates */ + arm_compute::Coordinates _coords; + /**< Parent tensor */ + arm_compute::ITensor *_parent; + /**< SubTensor */ + std::unique_ptr _subtensor; +}; + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif +#endif // ACL_TENSOR_H_ diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp index c816855f0293bc22321aaef885ae82610dce4d86..53baf82a6fcf77f5d68c0cd4b31236ad6afa19ab 100644 --- a/src/operators/kernel/mali/batchnorm_kernel.cpp +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -16,20 +16,152 @@ limitations under the License. 
*/ #include "operators/kernel/batchnorm_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { +template +class AclBatchNormOp : public acl::ACLOperator { + public: + AclBatchNormOp() { + this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN; + } + ~AclBatchNormOp() = default; + AclBatchNormOp(const AclBatchNormOp&) = delete; + AclBatchNormOp& operator=(const AclBatchNormOp&) = delete; + AclBatchNormOp(AclBatchNormOp&&) = delete; + AclBatchNormOp& operator=(AclBatchNormOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const BatchNormParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + arm_compute::TensorShape mean_shape(args.in_depth); + arm_compute::TensorShape var_shape = mean_shape; + arm_compute::TensorShape beta_shape = mean_shape; + arm_compute::TensorShape gamma_shape = mean_shape; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + new_tensor(mean(), mean_shape, args.mean_data); + new_tensor(var(), var_shape, args.var_data); + new_tensor(beta(), beta_shape, args.biases_data); + new_tensor(gamma(), gamma_shape, args.weight_data); + + acl_configure(bn, this, args.epsilon); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const BatchNormParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const BatchNormParam& param) { + const Tensor* in_x = param.InputX(); + Tensor* out = param.OutputY(); + const Tensor* scale = param.InputScale(); + const Tensor* bias = param.InputBias(); + const Tensor* saved_mean = param.InputMean(); + const Tensor* saved_variance = param.InputVariance(); + + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(); + const T* weight_data = scale->data(); + const T* bias_data = bias->data(); + const T* mean_data = saved_mean->data(); + const T* var_data = saved_variance->data(); + + float epsilon = param.Epsilon(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + // args.weight_data = (void*)weight_data; + // args.biases_data = (void*)bias_data; + args.mean_data = (void*)mean_data; + args.var_data = (void*)var_data; + args.epsilon = epsilon; + + args.dim = in_x->dims().size(); + + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + + args.out_num = out->dims()[0]; + args.out_depth = out->dims()[1]; + args.out_rows = out->dims()[2]; + args.out_cols = out->dims()[3]; + + args.weight_data = (void*)weight_data; + args.biases_data = (void*)bias_data; + + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + template <> -bool 
BatchNormKernel::Init(const BatchNormParam ¶) const { +bool BatchNormKernel::Init(const BatchNormParam& param) const { + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclBatchNormOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } return true; } template <> void BatchNormKernel::Compute( - const BatchNormParam ¶m) const {} + const BatchNormParam& param) const { + std::cout << "init acl" << std::endl; + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} +template class BatchNormKernel; } // namespace operators } // namespace paddle_mobile #endif +#endif diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08ee58d41577dfb5fd3a99755d66b5677b7b7ed2 --- /dev/null +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
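Every Mali kernel in this patch repeats the same `Init()`/`Compute()` contract: `Init()` allocates the ACL wrapper once and caches it on the kernel via `SetAclOp()`; `Compute()` fetches it back with `GetAclOp()`, checks `Bypass_acl()`, and otherwise reconfigures and runs it. Two caveats visible in the code: the "init acl failed" message actually means "bypassed", and a bypassed call simply returns without invoking a CPU fallback. A generic sketch of the shared flow (type names are placeholders):

```cpp
template <typename Kernel, typename AclOp, typename Param>
void mali_compute(const Kernel *self, const Param &param) {
  auto *op = reinterpret_cast<AclOp *>(self->GetAclOp());
  if (op == nullptr) return;          // Init() never ran
  if (op->Bypass_acl(param)) return;  // bypassed: nothing is computed here
  op->InitAclLayer(param);            // no-op when the shape is unchanged
  acl::AclParameters &args = op->getargs();
  op->RunAcl(args.input_data, args.output_data);
}
```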
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConcatOp : public acl::ACLOperator { + public: + AclConcatOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT; + } + ~AclConcatOp() = default; + AclConcatOp(const AclConcatOp&) = delete; + AclConcatOp& operator=(const AclConcatOp&) = delete; + AclConcatOp(AclConcatOp&&) = delete; + AclConcatOp& operator=(AclConcatOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + + void InitAclLayer(const ConcatParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + const std::vector* input_data = &args.in_tensor; + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.batch); + + if (is_operator_init_done(output_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + T type; + + for (int i = 0; i < input_data->size(); i++) { + const T* idata = (*input_data)[i]->data(); + const T* pdata = (*input_data)[i]->data(); + int in_batch = (*input_data)[i]->dims()[0]; + int in_channels = (*input_data)[i]->dims()[1]; + int in_width = (*input_data)[i]->dims()[2]; + int in_height = (*input_data)[i]->dims()[3]; + arm_compute::TensorShape in_shape(in_width, in_height, in_channels); + + new_tensor(cinput(i), in_shape, + acl::InputdataPtr(this, args.in_tensor, type, i)); + } + + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(concat, this, input_data->size()); + } + + void RunAcl(const std::vector& input, void* output) { + T type; + acl::acl_run(this, input, output, type); + } + bool Bypass_acl(const ConcatParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || !args.is_channel_concat) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void AclParametersByContext(const ConcatParam& param) { + auto inputs = param.Inputs(); + auto* output = param.Out(); + int64_t axis = param.Axis(); + + T* output_data = output->mutable_data(); + + args.is_channel_concat = (axis == 1); + args.in_tensor = inputs; + args.output_data = (void*)output_data; + + args.batch = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + } + acl::AclParameters args; +}; + +template <> +bool ConcatKernel::Init(const ConcatParam& param) const { + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConcatOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam& param) const { + std::cout << "init acl" << std::endl; + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + std::vector temp_data = args.in_tensor; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl(temp_data, (void*)output_data); +} + +template class ConcatKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git 
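`Bypass_acl()` above rejects everything except `axis == 1`, so the ACL path only handles channel concatenation via `DepthConcatenateLayer`. The shape rule it relies on, in NCHW terms (illustrative numbers):

```cpp
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> in_depths = {24, 32, 8};   // C of each input; N, H, W match
  int out_depth = 0;
  for (int c : in_depths) out_depth += c;     // DepthConcatenate stacks along C
  std::printf("output C = %d\n", out_depth);  // 64
}
```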
a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1d34910231c086673c58d8dba2c1e44992b5d593 --- /dev/null +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConvAddOp : public acl::ACLOperator { + public: + AclConvAddOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvAddOp() = default; + AclConvAddOp(const AclConvAddOp&) = delete; + AclConvAddOp& operator=(const AclConvAddOp&) = delete; + AclConvAddOp(AclConvAddOp&&) = delete; + AclConvAddOp& operator=(AclConvAddOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const FusionConvAddParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + check_direct_conv(); + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + if (args.biases_data) { + new_tensor(biases(), biases_shape, args.biases_data); + } + + group() = args.num_group; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const FusionConvAddParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + 
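The `PadStrideInfo` built in `InitAclLayer` uses FLOOR rounding, which fixes the output geometry; dilated convolutions are bypassed, so the plain formula applies. A worked example:

```cpp
#include <cstdio>

// out = floor((in + 2 * pad - kernel) / stride) + 1, dilation == 1
int main() {
  int in = 224, kernel = 3, pad = 1, stride = 2;
  int out = (in + 2 * pad - kernel) / stride + 1;  // integer division == floor
  std::printf("out = %d\n", out);                  // 112
}
```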
use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const FusionConvAddParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + Tensor* bias; + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(); + const T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + try { + bias = param.Bias(); + } catch (const std::exception& e) { + } + if (bias) { + const T* biases_data = bias->data(); + args.biases_data = (void*)biases_data; + } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + // std::cout <<"In N: " << args.batch << " C: " << args.in_depth + // << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; + +template <> +bool ConvAddKernel::Init( + const FusionConvAddParam& param) const { + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvAddOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ConvAddKernel::Compute( + const FusionConvAddParam& param) const { + std::cout << "init acl" << std::endl; + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << 
std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} + +template class ConvAddKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp index a98b9d8bdf2748d6bb942aa1ab3dbd76ffdad913..f3212cae970b2a554412f59cf48a6e5156463969 100644 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -15,20 +15,213 @@ limitations under the License. */ #ifdef CONV_OP #include "operators/kernel/conv_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { +template +class AclConvOp : public acl::ACLOperator { + public: + AclConvOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvOp() = default; + AclConvOp(const AclConvOp&) = delete; + AclConvOp& operator=(const AclConvOp&) = delete; + AclConvOp(AclConvOp&&) = delete; + AclConvOp& operator=(AclConvOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const ConvParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + check_direct_conv(); + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + // if (args.biases_data) { + // new_tensor(biases(),biases_shape,args.biases_data); + //} + + group() = args.num_group; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const ConvParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + 
kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const ConvParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(); + const T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + // try { + // bias = context.Input("Bias"); + // } catch (const std::exception& e) { + // } + // if (bias) { + // const T* biases_data = bias->data(); + // args.biases_data = (void*)biases_data; + // } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + std::cout << "In N: " << args.batch << " C: " << args.in_depth + << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; + template <> -bool ConvKernel::Init(const ConvParam ¶) const { +bool ConvKernel::Init(const ConvParam& param) const { + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } return true; } template <> -void ConvKernel::Compute(const ConvParam ¶m) const { - // ArmConvImplement imp; - // imp.Compute(param); - param.Output()->mutable_data()[0] = 100.0; +void ConvKernel::Compute(const ConvParam& param) const { + std::cout << "init acl" << std::endl; + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = 
acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); } template class ConvKernel; @@ -36,3 +229,4 @@ template class ConvKernel; } // namespace paddle_mobile #endif +#endif diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..43d33b3fd2b2cc747ae8c943437e675c84a4cdc6 --- /dev/null +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEADD_OP + +#pragma once + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template +struct AddFunctor { + inline T operator()(T a, T b) const { return a + b; } +}; + +template <> +bool ElementwiseAddKernel::Init( + const ElementwiseAddParam ¶) const { + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + AddFunctor(), Out); +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..64ab07a9b955893c01e2684cba0a14fa25d032ed --- /dev/null +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64ab07a9b955893c01e2684cba0a14fa25d032ed
--- /dev/null
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
@@ -0,0 +1,77 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FC_OP
+
+#pragma once
+
+#include "operators/kernel/fusion_fc_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
+  return true;
+}
+
+template <>
+void FusionFcKernel<GPU_MALI, float>::Compute(
+    const FusionFcParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  const Tensor *input_z = param.InputZ();
+  auto *input_z_data = input_z->data<float>();
+  int axis = param.Axis();
+  Tensor *out = param.Out();
+  auto *out_data = out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim[1] must equal the bias length.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
+
+  int64_t classes = input_z->numel();
+  for (int i = 0; i < out_dim[0]; i++) {
+    memory::Copy(out_data + i * classes, input_z_data,
+                 sizeof(float) * classes);
+  }
+
+  for (int i = 0; i < out->numel(); i++) {
+    DLOG << out_data[i];
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false,
+                      static_cast<float>(1), out, static_cast<float>(1));
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  // if (out_dim.size() != 2) {
+  //   out->Resize(out_dim);
+  // }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
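Worth spelling out the bias trick above: the kernel first tiles the length-N bias into every row of `out`, then calls matmul with beta == 1 so the GEMM accumulates on top of it. A scalar reference of the same computation (the M/K/N naming is mine):

#include <cstring>

// out = X(MxK) * Y(KxN) + bias(N), in the same order the kernel uses:
// row-wise bias copy first, accumulating GEMM second (alpha == 1, beta == 1).
void fc_ref(const float* x, const float* y, const float* bias, float* out,
            int M, int K, int N) {
  for (int i = 0; i < M; ++i) {
    std::memcpy(out + i * N, bias, sizeof(float) * N);  // tile bias per row
  }
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += x[i * K + k] * y[k * N + j];
      out[i * N + j] += acc;  // beta == 1: accumulate onto the tiled bias
    }
  }
}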
diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c063ec8783382ccef79086368df8a97320010c23
--- /dev/null
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
@@ -0,0 +1,148 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LRN_OP
+
+#pragma once
+
+#include "operators/kernel/lrn_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class AclLrnOp : public acl::ACLOperator {
+ public:
+  AclLrnOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
+  }
+  ~AclLrnOp() = default;
+  AclLrnOp(const AclLrnOp&) = delete;
+  AclLrnOp& operator=(const AclLrnOp&) = delete;
+  AclLrnOp(AclLrnOp&&) = delete;
+  AclLrnOp& operator=(AclLrnOp&&) = delete;
+
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const LrnParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
+
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+
+    arm_compute::NormalizationLayerInfo norm_info(
+        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
+        args.knorm);
+
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+
+    acl_configure(lrn, this, norm_info);
+  }
+
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const LrnParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+
+    return bypass_acl;
+  }
+
+ private:
+  void AclParametersByContext(const LrnParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.Out();
+
+    int n = param.N();
+    T alpha = param.Alpha();
+    T beta = param.Beta();
+    T k = param.K();
+
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+
+    args.nsize = n;
+    args.alpha = alpha;
+    args.beta = beta;
+    args.knorm = k;
+
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout << "Out C: " << args.out_depth
+    //           << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+
+template <>
+bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclLrnOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+
+template <>
+void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.in_cols * args.in_rows;
+  }
+}
+
+template class LrnKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+#endif
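For reference, the (nsize, alpha, beta, knorm) arguments handed to arm_compute::NormalizationLayerInfo correspond to the textbook cross-map LRN below. Whether the library additionally scales alpha by the window size is a configuration detail, so treat this as illustrative, not bit-exact:

#include <cmath>

// One output element of cross-map LRN on a CxHW plane: channel c, offset i.
float lrn_at(const float* in, int C, int HW, int c, int i,
             int n, float alpha, float beta, float k) {
  const int lo = c - n / 2 < 0 ? 0 : c - n / 2;
  const int hi = c + n / 2 >= C ? C - 1 : c + n / 2;
  float sum = 0.f;
  for (int j = lo; j <= hi; ++j) {
    const float v = in[j * HW + i];
    sum += v * v;  // sum of squares across neighbouring channels
  }
  return in[c * HW + i] / std::pow(k + alpha * sum, beta);
}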
diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2a84deaa1de999e94e335de6d4f40981bded5a8
--- /dev/null
+++ b/src/operators/kernel/mali/mul_kernel.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MUL_OP
+
+#pragma once
+
+#include "operators/kernel/mul_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
+  return true;
+}
+
+template <>
+void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false,
+                      static_cast<float>(1), out, static_cast<float>(0));
+  if (out_dim.size() != 2) {
+    out->Resize(out_dim);
+  }
+}
+
+template class MulKernel<GPU_MALI, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
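framework::ReshapeToMatrix, used by both this kernel and fushion_fc above, only does shape bookkeeping: it folds the leading num_col_dims dimensions into the row count and the rest into the column count. A sketch of that arithmetic:

#include <cstdint>
#include <utility>
#include <vector>

// E.g. dims [N, C, H, W] with num_col_dims == 1 become {N, C*H*W}.
std::pair<int64_t, int64_t> to_matrix_shape(const std::vector<int64_t>& dims,
                                            int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < num_col_dims ? rows : cols) *= dims[i];
  }
  return {rows, cols};
}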
diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9de90deebca05ef50cf94fa958f37bbcf1a08c4b
--- /dev/null
+++ b/src/operators/kernel/mali/pool_kernel.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+
+#pragma once
+
+#include "operators/kernel/pool_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class AclPoolOp : public acl::ACLOperator {
+ public:
+  AclPoolOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
+  }
+  ~AclPoolOp() = default;
+  AclPoolOp(const AclPoolOp&) = delete;
+  AclPoolOp& operator=(const AclPoolOp&) = delete;
+  AclPoolOp(AclPoolOp&&) = delete;
+  AclPoolOp& operator=(AclPoolOp&&) = delete;
+
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const PoolParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth);
+    // arm_compute::TensorShape weights_shape(
+    //     args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PoolingLayerInfo pool_info;
+
+    if (args.pool_type == "max") {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::MAX, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    } else {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::AVG, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    }
+
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+
+    acl_configure(pooling, this, pool_info);
+  }
+
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const PoolParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    if (args.pool_type != "max" && args.pool_type != "avg") {
+      bypass_acl = true;
+    }
+    if (args.filter_rows != args.filter_cols) {
+      bypass_acl = true;
+    }
+    // if (args.filter_rows != 2 && args.filter_rows != 3) {
+    //   bypass_acl = true;
+    // }
+    return bypass_acl;
+  }
+
+ private:
+  void AclParametersByContext(const PoolParam& param) {
+    const Tensor* in_x = param.Input();
+    Tensor* out = param.Output();
+    std::string pooling_type = param.PoolingType();
+
+    std::vector<int> ksize = param.Ksize();
+
+    std::vector<int> strides = param.Strides();
+
+    std::vector<int> paddings = param.Paddings();
+
+    bool is_global_pooling = param.isGlobalPooling();
+
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+
+    args.is_global_pool = is_global_pooling;
+    args.pool_type = pooling_type;
+
+    args.filter_rows = ksize[0];
+    args.filter_cols = ksize[1];
+    args.dim = ksize.size();
+
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout << "In N: " << args.batch << " C: " << args.in_depth
+    //           << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
" W: " << args.in_cols << "\n"; + // NCHW + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + // std::cout <<"Filter O: " << static_cast(filter->dims()[0]) + // << " I: " << static_cast(filter->dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"PoolingType: " << args.pool_type << "\n"; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + + args.out_depth = args.in_depth; + // args.out_rows = out->dims()[2]; + // args.out_cols = out->dims()[3]; + args.out_rows = static_cast(ceil(static_cast(args.in_rows + + 2 * args.pad_rows - + args.filter_rows) / + args.stride_rows)) + + 1; + args.out_cols = static_cast(ceil(static_cast(args.in_cols + + 2 * args.pad_cols - + args.filter_cols) / + args.stride_cols)) + + 1; + + if (is_global_pooling) { + args.filter_rows = args.in_rows; + args.filter_cols = args.in_cols; + args.pad_rows = 0; + args.pad_cols = 0; + } + } + acl::AclParameters args; +}; + +template <> +bool PoolKernel::Init(const PoolParam& param) const { + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclPoolOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam& param) const { + std::cout << "init acl" << std::endl; + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + for (int n = 0; n < args.batch; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth * args.in_cols * args.in_rows; + output_data += args.in_depth * args.out_cols * args.out_rows; + } +} + +template class PoolKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3deebc9d2f1a9f652813362f4947f744f0541482 --- /dev/null +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3deebc9d2f1a9f652813362f4947f744f0541482
--- /dev/null
+++ b/src/operators/kernel/mali/relu_kernel.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RELU_OP
+
+#pragma once
+
+#include "operators/kernel/relu_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class AclReluOp : public acl::ACLOperator {
+ public:
+  AclReluOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
+  }
+  ~AclReluOp() = default;
+  AclReluOp(const AclReluOp&) = delete;
+  AclReluOp& operator=(const AclReluOp&) = delete;
+  AclReluOp(AclReluOp&&) = delete;
+  AclReluOp& operator=(AclReluOp&&) = delete;
+
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ReluParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
+                                         args.in_depth * args.batch);
+    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
+                                          args.in_depth * args.out_num);
+    // arm_compute::TensorShape weights_shape(
+    //     args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::ActivationLayerInfo::ActivationFunction type;
+    type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+
+    arm_compute::ActivationLayerInfo act_info(type);
+
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+
+    acl_configure(activation, this, act_info);
+  }
+
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ReluParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+
+ private:
+  void AclParametersByContext(const ReluParam& param) {
+    const auto* input_x = param.InputX();
+    auto* out = param.Out();
+
+    const T* input_data = input_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+
+    args.batch = input_x->dims()[0];
+    args.in_depth = input_x->dims()[1];
+    args.in_rows = input_x->dims()[2];
+    args.in_cols = input_x->dims()[3];
+    args.out_num = out->dims()[0];
+  }
+  acl::AclParameters args;
+};
+
+template <>
+bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclReluOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+
+template <>
+void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+
+template class ReluKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+#endif
diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7521454d46dfc82064930971d2b996b542af54a
--- /dev/null
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE_OP
+
+#pragma once
+
+#include "operators/kernel/reshape_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
+  return true;
+}
+
+template <>
+void ReshapeKernel<GPU_MALI, float>::Compute(
+    const ReshapeParam &param) const {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  }
+
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
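ValidateShape (declared with the reshape op elsewhere in the repo) is what turns the shape attribute into concrete dims. Assuming it follows the usual Paddle reshape rules — a 0 copies the input dim at that index, a single -1 absorbs whatever is left — it behaves like this sketch:

#include <cstdint>
#include <vector>

// resolve_shape({-1, 4}, {2, 2, 4}, 16) -> {4, 4}; resolve_shape({0, -1},
// {2, 8}, 16) -> {2, 8}. Assumed semantics, shown for orientation only.
std::vector<int64_t> resolve_shape(std::vector<int64_t> shape,
                                   const std::vector<int64_t>& in_dims,
                                   int64_t numel) {
  int64_t known = 1, neg = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == 0) shape[i] = in_dims[i];  // 0: keep the input dim
    if (shape[i] == -1) {
      neg = static_cast<int64_t>(i);  // -1: defer, infer below
      continue;
    }
    known *= shape[i];
  }
  if (neg >= 0) shape[static_cast<size_t>(neg)] = numel / known;
  return shape;
}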
diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36edb3724600ada43606c23b1989615183ff21e8
--- /dev/null
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
@@ -0,0 +1,137 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#pragma once
+
+#include "operators/kernel/softmax_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class AclSoftmaxOp : public acl::ACLOperator {
+ public:
+  AclSoftmaxOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
+  }
+  ~AclSoftmaxOp() = default;
+  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
+  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
+
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const SoftmaxParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_depth, args.batch);
+
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+
+    acl_configure(softmax, this, NULL);
+  }
+
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const SoftmaxParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+
+    return bypass_acl;
+  }
+
+ private:
+  void AclParametersByContext(const SoftmaxParam& param) {
+    const framework::Tensor* in_x = param.InputX();
+    framework::Tensor* out = param.Out();
+    auto x_dims = in_x->dims();
+    out->Resize(x_dims);
+
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+
+    args.out_num = out->dims()[0];
+
+    // std::cout << "Out C: " << args.out_depth
+    //           << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+
+template <>
+bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclSoftmaxOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+
+template <>
+void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.out_num; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth;
+    output_data += args.in_depth;
+  }
+}
+
+template class SoftmaxKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+#endif
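The Compute loop above views the tensor as out_num rows of in_depth logits and hands ACL one row per acl_run call. For comparison, a scalar reference for one row, with the usual max-subtraction for numerical stability:

#include <cmath>

// softmax over one row of n logits: out[i] = exp(in[i] - max) / sum.
void softmax_row(const float* in, float* out, int n) {
  float m = in[0];
  for (int i = 1; i < n; ++i) m = in[i] > m ? in[i] : m;
  float sum = 0.f;
  for (int i = 0; i < n; ++i) sum += (out[i] = std::exp(in[i] - m));
  for (int i = 0; i < n; ++i) out[i] /= sum;
}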
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index 2533ab19a5084513a991082f148d546cb0059657..dc43cb022ac9d7435654cbc565c81c57ba80b350 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/math/depthwiseconv3x3s1p1.cpp b/src/operators/math/depthwiseconv3x3s1p1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88cac515201c114e83cb9e85b39a51fb3f8e7955
--- /dev/null
+++ b/src/operators/math/depthwiseconv3x3s1p1.cpp
@@ -0,0 +1,288 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/math/depthwiseconv3x3s1p1.h"
+#include <arm_neon.h>
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+using framework::Tensor;
+
+void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
+                          Tensor bias, bool if_bias) {
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter.data<float>();
+  float *output_data = output->data<float>();
+  const float *bias_data = bias.data<float>();
+
+  const int h = static_cast<int>(input->dims()[2]);
+  const int w = static_cast<int>(input->dims()[3]);
+  const int l = h;  // assumes a square feature map (h == w)
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int c = static_cast<int>(input->dims()[1]);
+  const int hxw = h * w;
+  float32x4_t vbias = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; ++b) {
+    const float *filter_data_tmp = filter_data;
+
+    for (int j = 0; j < c; ++j) {
+      if (if_bias) {
+        vbias = vdupq_n_f32(bias_data[j]);
+      }
+      // guard the scalar border path too: bias may be absent
+      const float bias_v = if_bias ? bias_data[j] : 0.0f;
+
+      int l_mid = l - 2;  // l=1->l_mid=-1,l=2->l_mid=0
+      float w00 = filter_data_tmp[0];
+      float w01 = filter_data_tmp[1];
+      float w02 = filter_data_tmp[2];
+      float w10 = filter_data_tmp[3];
+      float w11 = filter_data_tmp[4];
+      float w12 = filter_data_tmp[5];
+      float w20 = filter_data_tmp[6];
+      float w21 = filter_data_tmp[7];
+      float w22 = filter_data_tmp[8];
+
+      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
+                       w21 * input_data[l] + w22 * input_data[l + 1] +
+                       bias_v;
+      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
+                           w20 * input_data[2 * l - 2] +
+                           w21 * input_data[2 * l - 1] + bias_v;
+      output_data[(l - 1) * l] =
+          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
+          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
+          bias_v;
+      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
+                               w01 * input_data[(l - 2) * (l + 1) + 1] +
+                               w10 * input_data[l * l - 2] +
+                               w11 * input_data[l * l - 1] + bias_v;
+
+      for (int i = 1; i < l - 1; ++i) {
+        output_data[i * l] =
+            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
+            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
+            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
+            bias_v;
+        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
+                                     w01 * input_data[i * l + l - 1 - l] +
+                                     w10 * input_data[i * l + l - 1 - 1] +
+                                     w11 * input_data[i * l + l - 1] +
+                                     w20 * input_data[i * l + l - 1 + l - 1] +
+                                     w21 * input_data[i * l + l - 1 + l] +
+                                     bias_v;
+      }
+
+      // top 1 row and bottom 1 row
+      const float
*input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, out0; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + l); + const float *input_tmp_end = input_tmp + (l - 2) * l; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + l); + int c_mid = l_mid; + auto output_ptr = output_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to each 8 stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 4); + auto in3_tmp = vld1q_f32(input_tmp 
+ l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } + + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data += hxw; + input_data += hxw; + filter_data_tmp += 9; + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/depthwiseconv3x3s1p1.h b/src/operators/math/depthwiseconv3x3s1p1.h new file mode 100644 index 0000000000000000000000000000000000000000..019237a43192f30dfb70fe85e6b16a835cba4eba --- /dev/null +++ b/src/operators/math/depthwiseconv3x3s1p1.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; + +void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output, + Tensor bias, bool if_bias); +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index d97c6ec3e470bb2b083ef7e5234168c6fdfc34c1..49ae3a5e8484cb2f6628eb53cabd9321ae5705b8 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -60,6 +60,8 @@ USE_OP_CPU(mul); REGISTER_OPERATOR_CPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(mul); +REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index e8a469d43141f0b880605b52216094c292ca50fb..62eaf6b5f8105c4d2ab63f2f883445705b815860 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -64,6 +64,8 @@ USE_OP_CPU(pool2d); REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(pool2d); +REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index cf495d8bdace83f5dd7f86d372d07b3241867af9..877dcee1a7f4a5a75d013031235d3a216c35f854 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -38,6 +38,8 @@ USE_OP_CPU(relu); REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(relu); +REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 0fdcaf4d1a95ccd2a0ceccdc6d890b30a1d66368..c7294079b26250770006aeb1b79c15469489b988 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -37,6 +37,8 @@ USE_OP_CPU(reshape); REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(reshape); +REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index e25b59198f3206357a770a104080f99bafa84dc5..296e3ef30f7c0260cca169bcfe2f6b445493792a 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -32,6 +32,8 @@ USE_OP_CPU(softmax); REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(softmax); +REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/tools/push2android.sh b/tools/android-debug-script/push2android.sh similarity index 59% rename from tools/push2android.sh rename to tools/android-debug-script/push2android.sh index d7d1ad9950d58f415804834b8ebc0740a3e796cb..fae1a856123bd16cf3f7a115f61b3e4473ff58a3 100644 --- a/tools/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -1,10 +1,10 @@ #!/usr/bin/env sh push_fn () { -MODELS_PATH="../test/models/*" -MODELS_SRC="../test/models" -IMAGE_PATH="../test/images/*" -EXE_FILE="../test/build/*" +MODELS_PATH="../../test/models/*" +MODELS_SRC="../../test/models" +IMAGE_PATH="../../test/images/*" +EXE_FILE="../../test/build/*" EXE_DIR="data/local/tmp/bin" adb shell mkdir ${EXE_DIR} MODELS_DIR="data/local/tmp/models" @@ -14,9 +14,14 @@ do adb shell mkdir ${MODELS_DIR}"/"${file} done +if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then 
+ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*" +adb push ${ACL_BUILD_PATH} ${EXE_DIR} +fi + IMAGES_DIR="data/local/tmp/images" adb shell mkdir ${IMAGES_DIR} -LIB_PATH="../build/release/arm-v7a/build/*" +LIB_PATH="../../build/release/arm-v7a/build/*" adb push ${EXE_FILE} ${EXE_DIR} adb push ${LIB_PATH} ${EXE_DIR} if [[ $1 != "npm" ]]; then diff --git a/tools/scripts/run_on_android.sh b/tools/android-debug-script/run_on_android.sh similarity index 100% rename from tools/scripts/run_on_android.sh rename to tools/android-debug-script/run_on_android.sh diff --git a/tools/build.sh b/tools/build.sh index 4ac63315a94798d3aca63fb62aef511c4146cd3c..42e872c580cffef3bd904dc9cc575e9961ef4257 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -56,7 +56,7 @@ build_for_android() { MODE="Release" - ANDROID_PLATFORM_VERSION="android-15" + ANDROID_PLATFORM_VERSION="android-22" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" if [ $# -eq 1 ]; then diff --git a/tools/run.sh b/tools/run.sh deleted file mode 100644 index aaf0f52f0335d6e73060ed9b8e86a78ba357c552..0000000000000000000000000000000000000000 --- a/tools/run.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env sh -# auto build and run - -BUILDNET="mobilenetssd" -TESTUNIT="test-mobilenetssd" - -push_fn () { -sh build.sh android ${BUILDNET} -MODELS_PATH="../test/models/*" -MODELS_SRC="../test/models" -IMAGE_PATH="../test/images/*" -EXE_FILE="../test/build/*" -EXE_DIR="data/local/tmp/bin" -adb shell mkdir ${EXE_DIR} -MODELS_DIR="data/local/tmp/models" -adb shell mkdir ${MODELS_DIR} -for file in `ls ${MODELS_SRC}` -do - adb shell mkdir ${MODELS_DIR}"/"${file} -done - -IMAGES_DIR="data/local/tmp/images" -adb shell mkdir ${IMAGES_DIR} -LIB_PATH="../build/release/arm-v7a/build/*" -adb push ${EXE_FILE} ${EXE_DIR} -adb push ${LIB_PATH} ${EXE_DIR} -if [[ $1 != "npm" ]]; then -adb push ${IMAGE_PATH} ${IMAGES_DIR} -adb push ${MODELS_PATH} ${MODELS_DIR} -fi -adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}" -} - -if [[ $1 == "npm" ]]; then -push_fn $1 -else -push_fn -fi
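A note on the NEON idiom that DepthwiseConv3x3s1p1 above repeats throughout: load two adjacent q-registers, build the shifted taps with vextq_f32, accumulate with vmlaq_n_f32, and handle the borders separately (the kernel uses duplicated "pad" vectors; the sketch below uses a scalar tail). A free-standing single-row version of the same pattern — my naming, compile with NEON enabled:

#include <arm_neon.h>

// One padded row, 3-tap filter, 4 outputs per iteration:
// out[i] = w0*in[i] + w1*in[i+1] + w2*in[i+2], for i in [0, n);
// `in` must hold n + 2 floats (one element of padding on each side).
void conv3_row(const float* in, float* out, int n,
               float w0, float w1, float w2) {
  int i = 0;
  for (; i + 8 <= n; i += 4) {  // bound keeps the second load in range
    float32x4_t a = vld1q_f32(in + i);      // in[i..i+3]
    float32x4_t b = vld1q_f32(in + i + 4);  // in[i+4..i+7]
    float32x4_t t1 = vextq_f32(a, b, 1);    // in[i+1..i+4]
    float32x4_t t2 = vextq_f32(a, b, 2);    // in[i+2..i+5]
    float32x4_t r = vmulq_n_f32(a, w0);
    r = vmlaq_n_f32(r, t1, w1);
    r = vmlaq_n_f32(r, t2, w2);
    vst1q_f32(out + i, r);
  }
  for (; i < n; ++i) {  // scalar tail instead of the kernel's pad vectors
    out[i] = w0 * in[i] + w1 * in[i + 1] + w2 * in[i + 2];
  }
}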