提交 a7955b2a 编写于 作者: D dolphin8

Merge branch 'develop' of github.com:dolphin8/paddle-mobile into develop

[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
...@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON) ...@@ -7,7 +7,7 @@ option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON) option(LOG_PROFILE "log profile" ON)
# select the platform to build # select the platform to build
option(CPU "armv7 with neon" ON) option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF) option(MALI_GPU "mali gpu" ON)
option(FPGA "fpga" OFF) option(FPGA "fpga" OFF)
set(DEBUGING ON) set(DEBUGING ON)
if (CPU) if (CPU)
...@@ -15,7 +15,18 @@ if (CPU) ...@@ -15,7 +15,18 @@ if (CPU)
endif() endif()
if (MALI_GPU) if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DPADDLE_MOBILE_MALI_GPU)
# Preprocessor switches enabled together with PADDLE_MOBILE_MALI_GPU.
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
# ACL_Android is checked out as a git submodule at this path.
set(ACL_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android")
include_directories("${ACL_ROOT}" "${ACL_ROOT}/include")
# Library search paths and libraries are linker inputs. Appending -L/-l to
# CMAKE_CXX_FLAGS passes them to every compile command and puts them ahead of
# the object files on the link line, so register them with the linker-facing
# commands instead. Search-path order and library order are unchanged.
# NOTE(review): link_directories()/link_libraries() only affect targets
# created after this point - confirm the project's targets are declared below.
link_directories(
"${ACL_ROOT}/build"
"${ACL_ROOT}/build/opencl-1.2-stubs"
)
link_libraries(
arm_compute
arm_compute_core
arm_compute_graph
OpenCL
)
# -DUSE_ACL=1 is already set by add_definitions() above, so it is not repeated
# in CMAKE_CXX_FLAGS.
endif() endif()
if(FPGA) if(FPGA)
......
...@@ -138,9 +138,21 @@ class OpKernelBase { ...@@ -138,9 +138,21 @@ class OpKernelBase {
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h * 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* */ * */
#ifdef PADDLE_MOBILE_MALI_GPU
// Members below exist only in Mali GPU builds. They manage acl_op_, an
// untyped pointer slot owned by the kernel object.
// NOTE(review): "ACL" presumably refers to the ARM Compute Library operator
// backing this kernel - confirm against the Mali kernel implementations.

// Default constructor: the slot starts out empty (nullptr).
OpKernelBase() { acl_op_ = nullptr; }
// Returns whatever pointer is currently stored in the slot (nullptr if none).
void *GetAclOp() const { return acl_op_; }
// Stores `op` into the acl_op_ slot of the kernel object that `ob` points to.
// `ob` is reinterpret_cast to OpKernelBase<Dtype, P> * with no checking, so it
// must really point to such an object. Because this method is const, it does
// not assign this->acl_op_ directly; the write goes through `ob` instead.
// NOTE(review): presumably callers pass their own `this` as `ob` to get around
// the const qualifier - verify against the call sites.
void SetAclOp(void *op, void *ob) const {
reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
}
#endif
virtual void Compute(const P &para) const = 0; virtual void Compute(const P &para) const = 0;
virtual bool Init(const P &para) const { return true; }; virtual bool Init(const P &para) const { return true; };
virtual ~OpKernelBase() = default; virtual ~OpKernelBase() = default;
// The access specifier is emitted for every build; the member below exists
// only when PADDLE_MOBILE_MALI_GPU is defined.
private:
#ifdef PADDLE_MOBILE_MALI_GPU
// Untyped pointer slot initialised to nullptr by the constructor, read by
// GetAclOp() and written by SetAclOp(). Nothing in this class frees it.
void *acl_op_;
#endif
}; };
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
......
...@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm); ...@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp); REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -67,6 +67,8 @@ USE_OP_CPU(concat); ...@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add); ...@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -50,6 +50,8 @@ USE_OP_CPU(feed); ...@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU(feed, ops::FeedOp); REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -50,6 +50,8 @@ USE_OP_CPU(fetch); ...@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add); ...@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv_add);
REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel< ...@@ -68,11 +68,23 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
}; };
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar( static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher()); new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif #endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< ...@@ -64,8 +64,13 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
}; };
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new // static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// FusionConvAddReluOpMatcher()); // FusionConvAddReluOpMatcher());
#endif
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#endif #endif
......
...@@ -59,6 +59,8 @@ USE_OP_CPU(fc); ...@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp); REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fc);
REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -66,11 +66,19 @@ class FusionFcOp ...@@ -66,11 +66,19 @@ class FusionFcOp
}; };
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif #endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); #ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif #endif
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef BATCHNORM_OP #ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h" #include "operators/kernel/batchnorm_kernel.h"
#include <cmath> #include "operators/kernel/central-arm-func/batchnorm_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -27,215 +27,7 @@ bool BatchNormKernel<CPU, float>::Init(const BatchNormParam &para) const { ...@@ -27,215 +27,7 @@ bool BatchNormKernel<CPU, float>::Init(const BatchNormParam &para) const {
template <> template <>
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
const Tensor *input_x = param.InputX(); BatchnormCompute<float>(param);
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE(C == variance->numel(),
"C must equal to variance.numel()");
int HXW = H * W;
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
int index = i / 4;
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
}
} }
} // namespace operators } // namespace operators
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP #ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h" #include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -28,92 +29,7 @@ bool ConvAddReluKernel<CPU, float>::Init( ...@@ -28,92 +29,7 @@ bool ConvAddReluKernel<CPU, float>::Init(
template <> template <>
void ConvAddReluKernel<CPU, float>::Compute( void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam &param) const { const FusionConvAddReluParam &param) const {
const Tensor *input = param.Input(); ConvAddReluCompute<float>(param);
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true);
}
}
} }
template class ConvAddReluKernel<CPU, float>; template class ConvAddReluKernel<CPU, float>;
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef CONV_OP #ifdef CONV_OP
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -26,88 +27,7 @@ bool ConvKernel<CPU, float>::Init(const ConvParam &para) const { ...@@ -26,88 +27,7 @@ bool ConvKernel<CPU, float>::Init(const ConvParam &para) const {
template <> template <>
void ConvKernel<CPU, float>::Compute(const ConvParam &param) const { void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
const Tensor *input = param.Input(); ConvCompute<float>(param);
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
} }
template class ConvKernel<CPU, float>; template class ConvKernel<CPU, float>;
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef DEPTHWISECONV_OP #ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h" #include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const { ...@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const {
template <> template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
LOG(kLOG_DEBUG) << param; DepthwiseConvCompute<float>(param);
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
} }
template class DepthwiseConvKernel<CPU, float>; template class DepthwiseConvKernel<CPU, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
/**
 * Inference-time batch normalisation of a float tensor.
 *
 * The input dims are read as {N, C, H, W}; element (n, c, h, w) lives at
 * offset n*C*H*W + c*H*W + h*W + w. For every channel c:
 *
 *   inv_std   = 1 / sqrt(variance[c] + epsilon)
 *   new_scale = inv_std * scale[c]
 *   new_bias  = bias[c] - mean[c] * inv_std * scale[c]
 *   out       = x * new_scale + new_bias
 *
 * which equals ((x - mean) * inv_std) * scale + bias.
 *
 * Planes with more than 32 elements (H*W > 32) go through an ARMv7 NEON
 * inline-assembly loop that handles 32 floats per iteration; smaller planes
 * use a plain scalar loop.
 *
 * @tparam P     Not used by the body - the computation is hard-wired to
 *               float. Kept so existing BatchnormCompute<float>(...) calls
 *               keep compiling.
 * @param param  Supplies InputX, OutputY, Epsilon, InputMean, InputVariance,
 *               InputScale and InputBias.
 *
 * Enforces (via PADDLE_MOBILE_ENFORCE) that the variance tensor has exactly
 * C elements.
 */
template <typename P>
void BatchnormCompute(const BatchNormParam &param) {
  const Tensor *input_x = param.InputX();
  auto input_x_ptr = input_x->data<float>();
  const auto &x_dims = input_x->dims();
  const int N = x_dims[0];
  const int C = x_dims[1];
  const int H = x_dims[2];
  const int W = x_dims[3];
  const int HXW = H * W;        // elements in one (n, c) plane
  const int stride0 = C * HXW;  // elements in one batch item

  Tensor *out = param.OutputY();
  auto out_ptr = out->mutable_data<float>();
  const float epsilon = param.Epsilon();
  const Tensor *mean = param.InputMean();
  const Tensor *variance = param.InputVariance();
  const Tensor *scale = param.InputScale();
  const Tensor *bias = param.InputBias();
  auto mean_ptr = mean->data<float>();
  auto variance_ptr = variance->data<float>();
  auto scale_ptr = scale->data<float>();
  auto bias_ptr = bias->data<float>();

  PADDLE_MOBILE_ENFORCE(C == variance->numel(),
                        "C must equal to variance.numel()");

  // Nothing to normalise. This also keeps the table fill below from writing
  // C * 4 floats into zero-length arrays when N == 0.
  if (N <= 0 || C <= 0 || HXW <= 0) {
    return;
  }

  if (HXW > 32) {
    // Each channel's scale and bias is broadcast into 4 consecutive floats so
    // a single vld1.32 fills a whole q register. The C * 4 table is then
    // repeated once per batch item, so the assembly can walk both tables
    // front to back without ever rewinding a pointer.
    const int NXC = N * C;
    float *new_scale_ptr = new float[NXC * 4];
    float *new_bias_ptr = new float[NXC * 4];

    for (int c = 0; c < C; ++c) {
      /// std = (var + epsilon).sqrt();
      /// inv_std = 1 / std;
      const float inv_std =
          1 / static_cast<float>(pow((variance_ptr[c] + epsilon), 0.5));
      const float channel_scale = inv_std * scale_ptr[c];
      const float channel_bias =
          bias_ptr[c] - mean_ptr[c] * inv_std * scale_ptr[c];
      for (int lane = 0; lane < 4; ++lane) {
        new_scale_ptr[c * 4 + lane] = channel_scale;
        new_bias_ptr[c * 4 + lane] = channel_bias;
      }
    }
    for (int j = C * 4; j < NXC * 4; ++j) {
      new_scale_ptr[j] = new_scale_ptr[j - C * 4];
      new_bias_ptr[j] = new_bias_ptr[j - C * 4];
    }

    // The assembly advances the pointers and counts the loop counters down,
    // so they are passed as read-write operands bound to scratch copies. The
    // variables used after the asm (for delete[]) are never handed to it.
    auto in_cur = input_x_ptr;
    auto out_cur = out_ptr;
    const float *scale_cur = new_scale_ptr;
    const float *bias_cur = new_bias_ptr;
    int n_left = N;
    int c_left = 0;  // reloaded from C at the top of every batch item

    asm volatile(
        "subs       %[n_left], %[n_left], #1          \n\t"
        "blt        end_n_%=                          \n\t"
        "loop_n_%=:                                   \n\t"

        // Restart the channel counter for this batch item. Counting the C
        // operand itself down left it at -1 after the first batch item, which
        // skipped every later one.
        "subs       %[c_left], %[C], #1               \n\t"
        "blt        end_c_%=                          \n\t"
        "loop_c_%=:                                   \n\t"

        // q9 = scale x4, q10 = bias x4 for the current channel.
        "vld1.32    {q9}, [%[scale_cur]]!             \n\t"
        "vld1.32    {q10}, [%[bias_cur]]!             \n\t"

        // r6 = elements left in this plane minus 32.
        "mov        r6, %[HXW]                        \n\t"
        "subs       r6, r6, #32                       \n\t"
        "blt        end_hw_%=                         \n\t"
        "loop_hw_%=:                                  \n\t"

        "vld1.32    {q1, q2}, [%[in_cur]]!            \n\t"
        "vld1.32    {q3, q4}, [%[in_cur]]!            \n\t"
        "vld1.32    {q5, q6}, [%[in_cur]]!            \n\t"
        "vld1.32    {q7, q8}, [%[in_cur]]!            \n\t"

        "vmul.f32   q1, q1, q9                        \n\t"
        "vmul.f32   q2, q2, q9                        \n\t"
        "vmul.f32   q3, q3, q9                        \n\t"
        "vmul.f32   q4, q4, q9                        \n\t"
        "vmul.f32   q5, q5, q9                        \n\t"
        "vmul.f32   q6, q6, q9                        \n\t"
        "vmul.f32   q7, q7, q9                        \n\t"
        "vmul.f32   q8, q8, q9                        \n\t"

        "vadd.f32   q1, q1, q10                       \n\t"
        "vadd.f32   q2, q2, q10                       \n\t"
        "vadd.f32   q3, q3, q10                       \n\t"
        "vadd.f32   q4, q4, q10                       \n\t"
        "vadd.f32   q5, q5, q10                       \n\t"
        "vadd.f32   q6, q6, q10                       \n\t"
        "vadd.f32   q7, q7, q10                       \n\t"
        "vadd.f32   q8, q8, q10                       \n\t"

        "vst1.32    {q1, q2}, [%[out_cur]]!           \n\t"
        "vst1.32    {q3, q4}, [%[out_cur]]!           \n\t"
        "vst1.32    {q5, q6}, [%[out_cur]]!           \n\t"
        "vst1.32    {q7, q8}, [%[out_cur]]!           \n\t"

        "subs       r6, r6, #32                       \n\t"
        "bge        loop_hw_%=                        \n\t"
        "end_hw_%=:                                   \n\t"

        // Here r6 == (HXW % 32) - 32, i.e. in [-32, -1]. r6 == -32 means the
        // plane was an exact multiple of 32 and there is no tail to process.
        "cmn        r6, #32                           \n\t"
        "beq        end_remainder_%=                  \n\t"

        // Tail: step both cursors back by (32 - remainder) floats (r6 is
        // negative, scaled by 4 bytes) and redo the final 32 elements of the
        // plane. The overlapping elements are recomputed from the input, so
        // this is only correct when the output does not alias the input.
        "add        %[in_cur], %[in_cur], r6, lsl #2   \n\t"

        "vld1.32    {q1, q2}, [%[in_cur]]!            \n\t"
        "vld1.32    {q3, q4}, [%[in_cur]]!            \n\t"
        "vld1.32    {q5, q6}, [%[in_cur]]!            \n\t"
        "vld1.32    {q7, q8}, [%[in_cur]]!            \n\t"

        "vmul.f32   q1, q1, q9                        \n\t"
        "vmul.f32   q2, q2, q9                        \n\t"
        "vmul.f32   q3, q3, q9                        \n\t"
        "vmul.f32   q4, q4, q9                        \n\t"
        "vmul.f32   q5, q5, q9                        \n\t"
        "vmul.f32   q6, q6, q9                        \n\t"
        "vmul.f32   q7, q7, q9                        \n\t"
        "vmul.f32   q8, q8, q9                        \n\t"

        "vadd.f32   q1, q1, q10                       \n\t"
        "vadd.f32   q2, q2, q10                       \n\t"
        "vadd.f32   q3, q3, q10                       \n\t"
        "vadd.f32   q4, q4, q10                       \n\t"
        "vadd.f32   q5, q5, q10                       \n\t"
        "vadd.f32   q6, q6, q10                       \n\t"
        "vadd.f32   q7, q7, q10                       \n\t"
        "vadd.f32   q8, q8, q10                       \n\t"

        "add        %[out_cur], %[out_cur], r6, lsl #2 \n\t"

        "vst1.32    {q1, q2}, [%[out_cur]]!           \n\t"
        "vst1.32    {q3, q4}, [%[out_cur]]!           \n\t"
        "vst1.32    {q5, q6}, [%[out_cur]]!           \n\t"
        "vst1.32    {q7, q8}, [%[out_cur]]!           \n\t"

        "end_remainder_%=:                            \n\t"

        "subs       %[c_left], %[c_left], #1          \n\t"
        "bge        loop_c_%=                         \n\t"
        "end_c_%=:                                    \n\t"

        "subs       %[n_left], %[n_left], #1          \n\t"
        "bge        loop_n_%=                         \n\t"
        "end_n_%=:                                    \n\t"
        : [in_cur] "+r"(in_cur), [out_cur] "+r"(out_cur),
          [scale_cur] "+r"(scale_cur), [bias_cur] "+r"(bias_cur),
          [n_left] "+r"(n_left), [c_left] "=&r"(c_left)
        : [C] "r"(C), [HXW] "r"(HXW)
        // "cc": subs/cmn rewrite the condition flags.
        : "memory", "cc", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
          "q9", "q10", "r6");

    delete[] new_scale_ptr;
    delete[] new_bias_ptr;
  } else {
    // Small planes: scalar loop, one channel at a time.
    /// ((x - est_mean) * (inv_var) * scale + bias equal to
    /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    for (int c = 0; c < C; ++c) {
      const float inv_std =
          1 / static_cast<float>(pow((variance_ptr[c] + epsilon), 0.5));
      const float channel_scale = inv_std * scale_ptr[c];
      const float channel_bias =
          bias_ptr[c] - mean_ptr[c] * inv_std * scale_ptr[c];
      for (int n = 0; n < N; ++n) {
        // Each (n, c) plane is HXW contiguous floats.
        const int base = n * stride0 + c * HXW;
        for (int k = 0; k < HXW; ++k) {
          out_ptr[base + k] =
              input_x_ptr[base + k] * channel_scale + channel_bias;
        }
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
/**
 * Fused convolution + bias add + activation for float tensors.
 *
 * The bias is expanded to the output shape and the output is made to share
 * that buffer; each per-group GEMM is then issued with the last scalar
 * argument set to 1 and a trailing `true` flag.
 * NOTE(review): presumably that scalar is the GEMM beta (accumulate onto the
 * bias) and the flag enables ReLU -- confirm against math::matmul.
 *
 * @param param fused-op parameters (input, filter, bias, axis, output, groups,
 *              strides, paddings, dilations).
 */
template <typename P>
void ConvAddReluCompute(const FusionConvAddReluParam &param) {
  const Tensor *input = param.Input();
  // Local Tensor copies: the Resize()/expand calls below act on these objects.
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  int axis = param.Axis();
  Tensor *output = param.Output();

  // Seed the output with the expanded bias before any GEMM runs.
  math::expand_bias(bias, axis, output->dims());
  output->ShareDataWith(bias);

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  const int batch_size = static_cast<int>(input->dims()[0]);

  const std::vector<int64_t> kernel_dims(framework::vectorize(filter.dims()));
  const std::vector<int64_t> result_dims(
      framework::vectorize(output->dims()));
  const size_t spatial_rank = kernel_dims.size() - 2;

  // Column buffer shape: [in_channels / groups, kernel dims..., output dims...]
  std::vector<int64_t> col_dims(1 + 2 * spatial_rank);
  col_dims[0] = input->dims()[1] / groups;
  for (size_t d = 0; d < spatial_rank; ++d) {
    col_dims[d + 1] = kernel_dims[d + 2];
    col_dims[d + 1 + spatial_rank] = result_dims[d + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_dims));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, spatial_rank + 1);

  // A scratch column buffer is only allocated when expansion is required.
  const bool needs_expand =
      math::IsExpand(kernel_dims, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (needs_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);

  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // Channels handled by each group on the input and output side.
  const int in_step = static_cast<int>(input->dims()[1]) / groups;
  const int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int n = 0; n < batch_size; ++n) {
    Tensor in_batch = input->Slice(n, n + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(n, n + 1).Resize(output_matrix_shape);

    for (int grp = 0; grp < groups; ++grp) {
      const int in_begin = grp * in_step;
      const int out_begin = grp * out_step;
      Tensor in_slice = in_batch.Slice(in_begin, in_begin + in_step);

      if (needs_expand) {
        if (spatial_rank == 2U) {
          // im2col
          std::vector<int> pads{paddings[0], paddings[1], paddings[0],
                                paddings[1]};
          im2col(in_slice, dilations, strides, pads, &col);
        } else if (spatial_rank == 3U) {
          // vol2col
          vol2col(in_slice, dilations, strides, paddings, &col);
        }
      } else {
        // No expansion: the input slice itself is viewed as the column matrix.
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(out_begin, out_begin + out_step);
      Tensor filter_slice = filter.Slice(out_begin, out_begin + out_step);
      math::matmul<float>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
                          static_cast<float>(1), true);
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
/**
 * Float convolution computed as im2col / vol2col followed by one GEMM per
 * (batch item, group) pair.
 *
 * @param param convolution parameters (input, filter, output, groups,
 *              strides, paddings, dilations).
 */
template <typename P>
void ConvCompute(const ConvParam &param) {
  const Tensor *input = param.Input();
  // Local Tensor copy: the Resize() below acts on this object.
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  const int batch_size = static_cast<int>(input->dims()[0]);

  const std::vector<int64_t> kernel_dims(framework::vectorize(filter.dims()));
  const std::vector<int64_t> result_dims(
      framework::vectorize(output->dims()));
  const size_t spatial_rank = kernel_dims.size() - 2;

  // Column buffer shape: [in_channels / groups, kernel dims..., output dims...]
  std::vector<int64_t> col_dims(1 + 2 * spatial_rank);
  col_dims[0] = input->dims()[1] / groups;
  for (size_t d = 0; d < spatial_rank; ++d) {
    col_dims[d + 1] = kernel_dims[d + 2];
    col_dims[d + 1 + spatial_rank] = result_dims[d + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_dims));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, spatial_rank + 1);

  // A scratch column buffer is only allocated when expansion is required.
  const bool needs_expand =
      math::IsExpand(kernel_dims, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (needs_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);

  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // Channels handled by each group on the input and output side.
  const int in_step = static_cast<int>(input->dims()[1]) / groups;
  const int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int n = 0; n < batch_size; ++n) {
    Tensor in_batch = input->Slice(n, n + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(n, n + 1).Resize(output_matrix_shape);

    for (int grp = 0; grp < groups; ++grp) {
      const int in_begin = grp * in_step;
      const int out_begin = grp * out_step;
      Tensor in_slice = in_batch.Slice(in_begin, in_begin + in_step);

      if (needs_expand) {
        if (spatial_rank == 2U) {
          // im2col
          std::vector<int> pads{paddings[0], paddings[1], paddings[0],
                                paddings[1]};
          im2col(in_slice, dilations, strides, pads, &col);
        } else if (spatial_rank == 3U) {
          // vol2col
          vol2col(in_slice, dilations, strides, paddings, &col);
        }
      } else {
        // No expansion: the input slice itself is viewed as the column matrix.
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(out_begin, out_begin + out_step);
      Tensor filter_slice = filter.Slice(out_begin, out_begin + out_step);
      math::matmul<float>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
                          static_cast<float>(0));
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
/**
 * Depthwise convolution entry point for float tensors. The visible body uses
 * the same im2col / vol2col + per-group GEMM scheme as the generic
 * convolution; the group count comes from `param.Groups()`.
 *
 * @param param convolution parameters (input, filter, output, groups,
 *              strides, paddings, dilations).
 */
template <typename P>
void DepthwiseConvCompute(const ConvParam &param) {
  const Tensor *input = param.Input();
  // Local Tensor copy: the Resize() below acts on this object.
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  const int batch_size = static_cast<int>(input->dims()[0]);

  const std::vector<int64_t> kernel_dims(framework::vectorize(filter.dims()));
  const std::vector<int64_t> result_dims(
      framework::vectorize(output->dims()));
  const size_t spatial_rank = kernel_dims.size() - 2;

  // Column buffer shape: [in_channels / groups, kernel dims..., output dims...]
  std::vector<int64_t> col_dims(1 + 2 * spatial_rank);
  col_dims[0] = input->dims()[1] / groups;
  for (size_t d = 0; d < spatial_rank; ++d) {
    col_dims[d + 1] = kernel_dims[d + 2];
    col_dims[d + 1 + spatial_rank] = result_dims[d + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_dims));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, spatial_rank + 1);

  // A scratch column buffer is only allocated when expansion is required.
  const bool needs_expand =
      math::IsExpand(kernel_dims, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (needs_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);

  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // Channels handled by each group on the input and output side.
  const int in_step = static_cast<int>(input->dims()[1]) / groups;
  const int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int n = 0; n < batch_size; ++n) {
    Tensor in_batch = input->Slice(n, n + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(n, n + 1).Resize(output_matrix_shape);

    for (int grp = 0; grp < groups; ++grp) {
      const int in_begin = grp * in_step;
      const int out_begin = grp * out_step;
      Tensor in_slice = in_batch.Slice(in_begin, in_begin + in_step);

      if (needs_expand) {
        if (spatial_rank == 2U) {
          // im2col
          std::vector<int> pads{paddings[0], paddings[1], paddings[0],
                                paddings[1]};
          im2col(in_slice, dilations, strides, pads, &col);
        } else if (spatial_rank == 3U) {
          // vol2col
          vol2col(in_slice, dilations, strides, paddings, &col);
        }
      } else {
        // No expansion: the input slice itself is viewed as the column matrix.
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(out_begin, out_begin + out_step);
      Tensor filter_slice = filter.Slice(out_begin, out_begin + out_step);
      math::matmul<float>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
                          static_cast<float>(0));
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> { ...@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
bool Init(const ConvParam &para) const; bool Init(const ConvParam &para) const;
}; };
// Reports whether a convolution needs an explicit im2col/vol2col expansion.
// Returns false only when, for every axis listed in `strides`, the filter
// extent (filter_dim[axis + 2]) is 1, the stride is 1, the padding is 0 and
// the dilation is 1; any deviation returns true. With empty `strides` the
// result is false.
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
                     const std::vector<int> &strides,
                     const std::vector<int> &paddings,
                     const std::vector<int> &dilations) {
  for (size_t axis = 0; axis < strides.size(); ++axis) {
    const bool trivial_axis = static_cast<int>(filter_dim[axis + 2]) == 1 &&
                              strides[axis] == 1 && paddings[axis] == 0 &&
                              dilations[axis] == 1;
    if (!trivial_axis) {
      return true;
    }
  }
  return false;
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
// Bitmask built from FLAGS_ENABLE_ACL_* values. Starts as
// FLAGS_ENABLE_ACL_CONCAT and may be overwritten from the BYPASSACL
// environment variable by the ACLOperator constructor.
// NOTE(review): the name suggests a set bit makes that layer type bypass ACL;
// confirm against the kernels that read this mask.
unsigned int bypass_acl_class_layer =
    (0 | FLAGS_ENABLE_ACL_CONCAT |
     /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
                                                   |*/
     0);
// Global schedule switch; written by AclEnableSchedule() and by the
// ENABLESCHEDULE environment variable, read through isScheduleEnable().
int enable_schedule = 0;
#ifdef USE_PROFILING
#include "arm_neon.h"
// Bitmask of MASK_LOG_* values; logtime_util only records and prints timings
// whose mask bit is set here. May be overwritten from the LOGACL environment
// variable by the ACLOperator constructor.
unsigned int acl_log_flags =
    (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */
     /*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */
     MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
     MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
     MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
     MASK_LOG_CONCAT | 0);
#include <stdio.h> /* printf */
#include <stdlib.h> /* getenv */
#endif // USE_PROFILING
// Once set, every ACLOperator constructed afterwards is switched to GPU mode.
static bool force_enable_gpu = false;

// Stores `enable` in the global schedule switch. A non-zero value also latches
// force_enable_gpu on; a zero value never clears it. Always returns true.
bool AclEnableSchedule(int enable) {
  enable_schedule = enable;
  force_enable_gpu = force_enable_gpu || (enable != 0);
  return true;
}
// Returns the current value of the global enable_schedule switch.
int isScheduleEnable() { return enable_schedule; }
namespace paddle_mobile {
namespace operators {
namespace acl {
// True until the first ACLOperator has run the one-time GPU scheduler setup.
bool ACLOperator::init_gpu_env = true;
#ifdef USE_OPENCL
// Set to true by the constructor once the OpenCL scheduler initialised.
bool ACLOperator::support_opencl_ = false;
// Thin wrapper around arm_compute::opencl_is_available().
bool opencl_is_available() { return arm_compute::opencl_is_available(); }
#elif defined(USE_OPENGLES)
// Set to true by the constructor once the GLES scheduler initialised.
bool ACLOperator::support_opengles_ = false;
#endif
// Builds an operator in the "not initialised" state.
//
// On the first construction only, the GPU scheduler (OpenCL or GLES, chosen
// at compile time) is initialised and the matching support_* flag recorded.
// Every construction then re-reads three optional environment variables:
//   BYPASSACL       -> bypass_acl_class_layer (bitmask)
//   LOGACL          -> acl_log_flags (bitmask, USE_PROFILING builds only)
//   ENABLESCHEDULE  -> enable_schedule
// Values are parsed with "%i" (decimal, 0x-hex or 0-octal). A value that does
// not parse is ignored instead of being read from an uninitialised local.
//
// @param is_gpu request GPU execution; overridden to true once
//               AclEnableSchedule() has latched force_enable_gpu.
ACLOperator::ACLOperator(bool is_gpu)
    : operator_state_(operator_not_init),
      force_bypass_acl_path_(false),
      target_hint_(TargetHint::DONT_CARE),
      convolution_method_hint_(ConvolutionMethodHint::GEMM),
      _group(1),
      name_(""),
      input_idx_(0),
      output_idx_(0),
      is_gpu_(is_gpu) {
  if (init_gpu_env) {
#ifdef USE_OPENCL
    try {
      if (opencl_is_available()) {
        arm_compute::CLScheduler::get().default_init();
        support_opencl_ = true;
      }
    } catch (std::exception& e) {
      support_opencl_ = false;
    }
#elif defined(USE_OPENGLES)
    try {
      arm_compute::GCScheduler::get().default_init();
      support_opengles_ = true;
    } catch (std::exception& e) {
      support_opengles_ = false;
    }
#endif
    init_gpu_env = false;
  }
  if (force_enable_gpu) is_gpu_ = true;

  const char* pBypassACL = getenv("BYPASSACL");
  if (pBypassACL) {
    int parsed = 0;
    // Only act on the variable when it actually contained a number.
    if (sscanf(pBypassACL, "%i", &parsed) == 1) {
      const unsigned int bacl = static_cast<unsigned int>(parsed);
      if (bacl != bypass_acl_class_layer) {
        bypass_acl_class_layer = bacl;
        printf("BYPASSACL<%s>\n", pBypassACL);
        printf("BYPASSACL: %x\n", bypass_acl_class_layer);
      }
    }
  }

#ifdef USE_PROFILING
  const char* pLogACL = getenv("LOGACL");
  if (pLogACL) {
    int parsed = 0;
    if (sscanf(pLogACL, "%i", &parsed) == 1) {
      const unsigned int alf = static_cast<unsigned int>(parsed);
      if (alf != acl_log_flags) {
        acl_log_flags = alf;
        printf("LOGACL<%s>\n", pLogACL);
        printf("LOGACL: %x\n", acl_log_flags);
      }
    }
  }
#endif  // USE_PROFILING

  const char* pEnableSchedule = getenv("ENABLESCHEDULE");
  if (pEnableSchedule) {
    int bshedule = 0;
    if (sscanf(pEnableSchedule, "%i", &bshedule) == 1 &&
        bshedule != enable_schedule) {
      enable_schedule = bshedule;
      printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
      printf("ENABLESCHEDULE: %x\n",
             static_cast<unsigned int>(enable_schedule));
    }
    if (enable_schedule) {
      AclEnableSchedule(1);
    }
  }
}
// Empty body: no explicit cleanup is performed here.
ACLOperator::~ACLOperator() {}
bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
arm_compute::TensorShape& shape, void* mem,
bool commit) {
auto acl_tensor =
new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
acl_tensor->set_target(getTargetHint());
acl_tensor->bindmem(mem);
if (commit) acl_tensor->commit();
tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
return true;
}
bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
std::unique_ptr<ACLTensor>& parent,
arm_compute::TensorShape& shape,
arm_compute::Coordinates& coord) {
auto acl_tensor = new ACLSubTensor(parent, shape, coord);
acl_tensor->set_target(getTargetHint());
tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
return true;
}
// Records `type` and, if a host buffer is bound, allocates the tensor on first
// use and copies the buffer into it (skipped for tensor_output). The binding
// is then cleared, so a later commit() without a new bindmem() only updates
// the type.
void ACLTensor::commit(TensorType type) {
  settensortype(type);
  if (!mem_) {
    return;
  }
  if (!allocate_) {
#ifdef USE_PROFILING
    logtime_util log_time(ACL_ALLOCATE_INFO);
#endif  // USE_PROFILING
    allocate();
    allocate_ = true;
  }
  const bool needs_upload = (type_ != tensor_output);
  if (needs_upload) {
    tensor_copy(mem_);
  }
  mem_ = nullptr;
}
// Copies between a flat host buffer and an ACL tensor, one row (extent of
// dimension 0) per memcpy. The window starts at DimY, so each visited
// coordinate addresses a whole row. The tensor is mapped for the duration of
// the copy.
//
// @param tensor   ACL tensor to read or write.
// @param mem      host buffer; indexed as [id[3]][z][y][x] using the tensor's
//                 first three extents.
// @param toTensor true: mem -> tensor; false: tensor -> mem.
// @return always 0.
int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
                               bool toTensor) {
#ifdef USE_PROFILING
  logtime_util log_time(ACL_COPY_INFO);
#endif  // USE_PROFILING
  arm_compute::Window window;
  // Iterate through the rows (not each element)
  window.use_tensor_dimensions(tensor->info()->tensor_shape(),
                               /* first_dimension =*/arm_compute::Window::DimY);
  int width = tensor->info()->tensor_shape()[0];
  int height = tensor->info()->tensor_shape()[1];
  int deepth = tensor->info()->tensor_shape()[2];
  map();

  arm_compute::Iterator it(tensor, window);
  char* const host = (char*)mem;

  if (toTensor) {
    // mem -> tensor
    auto upload_row = [&](const arm_compute::Coordinates& id) {
      const int first = id[3] * (width * height * deepth) +
                        id.z() * (width * height) + id.y() * width + id.x();
      memcpy(it.ptr(), host + (first * tensor->info()->element_size()),
             width * tensor->info()->element_size());
    };
    arm_compute::execute_window_loop(window, upload_row, it);
  } else {
    // tensor -> mem
    auto download_row = [&](const arm_compute::Coordinates& id) {
      const int first = id[3] * (width * height * deepth) +
                        id.z() * (width * height) + id.y() * width;
      memcpy(host + (first * tensor->info()->element_size()), it.ptr(),
             width * tensor->info()->element_size());
    };
    arm_compute::execute_window_loop(window, download_row, it);
  }

  unmap();
  return 0;
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_OPERATOR_H_
#define ACL_OPERATOR_H_
#include <framework/tensor.h>
#include <operators/op_param.h>
#if USE_ACL == 1
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#endif
#ifdef USE_OPENGLES
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
#endif
#include "acl_tensor.h"
#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
#define FLAGS_ENABLE_ACL_BNLL 0x00000002
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_FC 0x00000008
#define FLAGS_ENABLE_ACL_LRN 0x00000010
#define FLAGS_ENABLE_ACL_POOLING 0x00000020
#define FLAGS_ENABLE_ACL_RELU 0x00000040
#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
#define FLAGS_ENABLE_ACL_TANH 0x00000200
#define FLAGS_ENABLE_ACL_LC 0x00000400
#define FLAGS_ENABLE_ACL_BN 0x00000800
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
extern unsigned int bypass_acl_class_layer;
#ifdef USE_PROFILING
#include <sys/time.h>
#define NANO_SEC_CONV 1000000
#define MASK_LOG_APP_TIME 0x00000001
#define MASK_LOG_ALLOCATE 0x00000002
#define MASK_LOG_RUN 0x00000004
#define MASK_LOG_CONFIG 0x00000008
#define MASK_LOG_COPY 0x00000010
#define MASK_LOG_ABSVAL 0x00000020
#define MASK_LOG_BNLL 0x00000040
#define MASK_LOG_CONV 0x00000080
#define MASK_LOG_FC 0x00000100
#define MASK_LOG_LRN 0x00000200
#define MASK_LOG_POOLING 0x00000400
#define MASK_LOG_RELU 0x00000800
#define MASK_LOG_SIGMOID 0x00001000
#define MASK_LOG_SOFTMAX 0x00002000
#define MASK_LOG_TANH 0x00004000
#define MASK_LOG_LC 0x00008000
#define MASK_LOG_BN 0x00010000
#define MASK_LOG_CONCAT 0x00020000
#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t"
#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t"
#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SIGMOID_INFO \
MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SOFTMAX_INFO \
MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_TANH_INFO \
MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_BN_INFO \
MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_CONCAT_INFO \
MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
extern unsigned int acl_log_flags;
// Scoped wall-clock timer for profiling builds.
//
// A timer is active only while its mask bit is set in the global
// acl_log_flags. An active timer stamps the start time when configured and
// prints "<label> <seconds>" to stdout when destroyed or when log_time(false)
// is called.
class logtime_util {
 public:
  // Creates an inactive timer (mask 0, empty label).
  logtime_util() : tv(), mask(0), information() {}

  // Creates a timer and starts it; see setlogtime_info().
  logtime_util(int mask_, const char *information_)
      : tv(), mask(0), information() {
    setlogtime_info(mask_, information_);
  }

  // Sets the mask and, if that mask is enabled in acl_log_flags, stores the
  // label (truncated to fit the buffer) and records the start time.
  void setlogtime_info(int mask_, const char *information_) {
    mask = mask_;
    if (acl_log_flags & mask) {
      strncpy(information, information_ ? information_ : "",
              sizeof(information) - 1);
      // strncpy does not terminate the buffer when the source fills it.
      information[sizeof(information) - 1] = '\0';
      gettimeofday(&tv[0], NULL);
    }
  }

  // Prints the elapsed time since the last start stamp when the mask is
  // enabled.
  ~logtime_util() {
    if (acl_log_flags & mask) {
      report();
    }
  }

  // start == true: restart the timer; start == false: print the elapsed time.
  // Does nothing when the mask is not enabled.
  void log_time(bool start) {
    if (acl_log_flags & mask) {
      if (start) {
        gettimeofday(&tv[0], NULL);
      } else {
        report();
      }
    }
  }

 private:
  // Stamps the end time and prints the label with the elapsed seconds.
  // The difference is taken per field before converting to double: absolute
  // microseconds since the epoch do not fit in an int.
  void report() {
    gettimeofday(&tv[1], NULL);
    const double seconds = static_cast<double>(tv[1].tv_sec - tv[0].tv_sec);
    const double micros = static_cast<double>(tv[1].tv_usec - tv[0].tv_usec);
    printf("%s %.6lf\n", information, seconds + micros / NANO_SEC_CONV);
  }

  struct timeval tv[2];   // [0] start stamp, [1] end stamp
  int mask;               // MASK_LOG_* bit this timer belongs to
  char information[256];  // label printed in front of the elapsed time
};
#endif // USE_PROFILING
namespace paddle_mobile {
namespace operators {
namespace acl {
// Plain parameter bundle handed from a kernel to its ACL operator.
// All scalar and pointer members start zeroed / null, except the defaults
// num_group = 1, dim = 2 and dilated = false.
class AclParameters {
 public:
  AclParameters()
      : batch(0),
        in_depth(0),
        in_rows(0),
        in_cols(0),
        out_depth(0),
        out_rows(0),
        out_cols(0),
        out_num(0),
        filter_rows(0),
        filter_cols(0),
        stride_rows(0),
        stride_cols(0),
        pad_rows(0),
        pad_cols(0),
        dilation_rows(0),
        dilation_cols(0),
        num_group(1),
        dilated(false),
        dim(2),
        epsilon(0),
        nsize(0),
        alpha(0.0f),
        beta(0.0f),
        knorm(0.0f),
        input_data(nullptr),
        output_data(nullptr),
        weight_data(nullptr),
        biases_data(nullptr),
        mean_data(nullptr),
        var_data(nullptr),
        is_global_pool(false),
        is_channel_concat(false) {}

  // Tensor geometry.
  int batch;
  int in_depth;
  int in_rows;
  int in_cols;

  int out_depth;
  int out_rows;
  int out_cols;
  int out_num;

  // Kernel geometry.
  int filter_rows;
  int filter_cols;

  int stride_rows;
  int stride_cols;

  int pad_rows;
  int pad_cols;

  int dilation_rows;
  int dilation_cols;

  int num_group;
  bool dilated;
  int dim;
  // NOTE(review): declared as int, so a fractional epsilon assigned here is
  // truncated; confirm whether this should be a float.
  int epsilon;

  int nsize;
  float alpha;
  float beta;
  float knorm;

  // Raw data pointers supplied by the caller.
  void *input_data;
  void *output_data;
  void *weight_data;
  void *biases_data;
  void *mean_data;
  void *var_data;

  std::string pool_type;
  std::string act_type;
  std::string data_layout;

  bool is_global_pool;
  bool is_channel_concat;

  std::vector<framework::LoDTensor *> in_tensor;
};
// Role of a tensor inside an operator's tensor pool; tensor_item() uses it as
// the lookup key and commit() receives it for every pooled tensor.
enum TensorType {
  tensor_input,
  tensor_output,  // ACLTensor::commit() skips the host-to-tensor copy for this
  tensor_weights,
  tensor_biases,
  tensor_mean,
  tensor_var,
  tensor_beta,
  tensor_gamma,
  tensor_concat,
  tensor_data,  // default argument of commit()
};
// Initialisation state tracked by ACLOperator.
enum OperatorState {
  operator_not_init,   // initial state; also reported when the tensor is absent
  operator_init_done,  // configured and the checked shape still matches
  operator_reinit,     // shape changed; checkreshape() frees the resources
};
// Kind of layer an ACLOperator represents; stored by set_operator_property().
enum OperateType {
  operate_type_pooling,
  operate_type_activation,
  operate_type_lrn,
  operate_type_conv,
  operate_type_lc,
  operate_type_fc,
  operate_type_bn,
  operate_type_softmax,
  operate_type_concat,
};
// Common bookkeeping shared by ACLTensor and ACLSubTensor: the bound host
// buffer, the tensor's role and whether backing memory was allocated.
class BaseACLTensor {
 public:
  // Starts with no bound buffer; mem_ must be null here because commit()
  // implementations test it before bindmem() is necessarily called.
  BaseACLTensor() : mem_(nullptr), type_(tensor_input), allocate_(false) {}
  virtual ~BaseACLTensor() {}
  // Remembers the host buffer to use on the next commit().
  virtual void bindmem(void *mem) { mem_ = mem; }
  virtual void settensortype(TensorType type) { type_ = type; }
  // Hooks overridden by concrete tensors; no-ops here.
  virtual void map(bool blocking = true) {}
  virtual void unmap() {}
  virtual void commit(TensorType type = tensor_data) {}
  // Row-wise copy between `mem` and `tensor`; direction chosen by toTensor.
  int tensor_copy(arm_compute::ITensor *tensor, void *mem,
                  bool toTensor = true);

 protected:
  void *mem_;        // host buffer bound via bindmem(); null when none
  TensorType type_;  // role recorded by settensortype()
  bool allocate_;    // true once backing memory has been allocated
};
// ACL tensor that owns its storage (via Tensor) and carries the host-buffer
// bookkeeping of BaseACLTensor.
class ACLTensor : public BaseACLTensor, public Tensor {
 public:
  explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {}

  // Maps the tensor, allocating its storage first if that has not happened.
  virtual void map(bool blocking = true) {
    if (!allocate_) {
      Tensor::allocate();
      allocate_ = true;
    }
    Tensor::map(blocking);
  }

  // Copies between `mem` and this tensor (direction chosen by toTensor).
  // Always returns 0.
  virtual int tensor_copy(void *mem, bool toTensor = true) {
    BaseACLTensor::tensor_copy(this->tensor(), mem, toTensor);
    return 0;
  }

  virtual void unmap() { Tensor::unmap(); }

  // Defined out of line; takes the tensor's role as argument.
  virtual void commit(TensorType type = tensor_data);
};
// Sub-tensor view constructed over a parent ACLTensor with its own shape and
// coordinates.
class ACLSubTensor : public BaseACLTensor, public SubTensor {
 public:
  ACLSubTensor(std::unique_ptr<ACLTensor> &parent,
               arm_compute::TensorShape &shape, arm_compute::Coordinates &coord)
      : SubTensor(parent.get(), shape, coord) {}
  // No-op: performs no copy and returns 0.
  virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; }
};
// Pool entry: an owned tensor together with the role it plays.
template <typename T>
class TensorPair {
 public:
  TensorPair() {}
  ~TensorPair() {}
  TensorType type;            // role of the tensor; set by tensor_item()
  std::unique_ptr<T> tensor;  // owned tensor; null until one is created
};
template <typename T>
std::unique_ptr<T> &tensor_item(
std::vector<std::unique_ptr<TensorPair<T>>> &pool, TensorType type,
int idx) {
int count = 0;
for (auto &item : pool) {
if (item.get()->type == type) {
++count;
}
if (item.get()->type == type && idx == count - 1) {
return item.get()->tensor;
}
}
pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>));
auto item = pool[pool.size() - 1].get();
item->type = type;
item->tensor = NULL;
return item->tensor;
}
// Base class for ACL-backed operators. It owns the operator's tensors and
// configured functions, tracks whether the operator must be (re)initialised
// for a given shape, and selects the execution target.
class ACLOperator {
 public:
  // Commits every pooled tensor that exists, passing the role it is stored
  // under.
  virtual void commit() {
    for (auto &item : tensor_pool_) {
      if (item.get()->tensor) item.get()->tensor->commit(item.get()->type);
    }
  }

  // Commits pending tensors, then runs the configured functions in order.
  inline void run() {
    commit();
#ifdef USE_PROFILING
    logtime_util log_time(ACL_RUN_INFO);
#endif  // USE_PROFILING
    for (auto &c : funcs_) {
      c->run();
    }
  }

  inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs() {
    return funcs_;
  }

  // Slot accessors. Each returns the idx-th slot of its role, appending an
  // empty slot to the pool when none exists yet (see tensor_item()).
  inline std::unique_ptr<ACLSubTensor> &sinput(int idx = 0) {
    return tensor_item(subtensor_pool_, tensor_input, idx);
  }
  inline std::unique_ptr<ACLSubTensor> &soutput(int idx = 0) {
    return tensor_item(subtensor_pool_, tensor_output, idx);
  }
  inline std::unique_ptr<ACLSubTensor> &sweights(int idx = 0) {
    return tensor_item(subtensor_pool_, tensor_weights, idx);
  }
  inline std::unique_ptr<ACLSubTensor> &sbiases(int idx = 0) {
    return tensor_item(subtensor_pool_, tensor_biases, idx);
  }
  inline std::unique_ptr<ACLTensor> &cinput(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_concat, idx);
  }
  inline std::unique_ptr<ACLTensor> &input(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_input, idx);
  }
  inline std::unique_ptr<ACLTensor> &output(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_output, idx);
  }
  inline std::unique_ptr<ACLTensor> &weights(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_weights, idx);
  }
  inline std::unique_ptr<ACLTensor> &biases(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_biases, idx);
  }
  inline std::unique_ptr<ACLTensor> &mean(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_mean, idx);
  }
  inline std::unique_ptr<ACLTensor> &var(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_var, idx);
  }
  inline std::unique_ptr<ACLTensor> &beta(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_beta, idx);
  }
  inline std::unique_ptr<ACLTensor> &gamma(int idx = 0) {
    return tensor_item(tensor_pool_, tensor_gamma, idx);
  }

  // Maps a role to its first slot; roles other than biases, weights and
  // output resolve to the input slot.
  inline std::unique_ptr<ACLTensor> &tensor(TensorType type) {
    switch (type) {
      case tensor_biases:
        return biases();
      case tensor_weights:
        return weights();
      case tensor_output:
        return output();
      default:
      case tensor_input:
        return input();
    }
  }

  explicit ACLOperator(bool is_gpu = false);
  virtual ~ACLOperator();

  // Resolves the execution target. An explicit hint wins; with DONT_CARE the
  // GPU backend compiled in is chosen when is_gpu_ is set, otherwise NEON.
  inline TargetHint getTargetHint() {
#ifdef USE_OPENCL
    if (target_hint_ == TargetHint::DONT_CARE) {
      if (is_gpu_) {
        return TargetHint::OPENCL;
      }
      return TargetHint::NEON;
    }
    return target_hint_;
#elif defined(USE_OPENGLES)
    if (target_hint_ == TargetHint::DONT_CARE) {
      if (is_gpu_) {
        return TargetHint::OPENGLES;
      }
      return TargetHint::NEON;
    }
    return target_hint_;
#else
    return TargetHint::NEON;
#endif
  }
  inline void setTargetHint(TargetHint hint) { target_hint_ = hint; }

  inline ConvolutionMethodHint &getConvMethod() {
    return convolution_method_hint_;
  }
  // Switches the convolution method hint to DIRECT.
  inline void setConvMethod() {
    convolution_method_hint_ = ConvolutionMethodHint::DIRECT;
  }

  // Binds a host buffer to `tensor` for the next commit.
  // Returns false when the slot holds no tensor.
  inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor, void *mem) {
    if (!tensor) return false;  // slot may exist without a tensor
    tensor->bindmem(mem);
    return true;
  }
  // Copies the contents of `tensor` out to the host buffer `mem`.
  // Returns false when the slot holds no tensor or `mem` is null.
  inline bool tensor_mem(void *mem, std::unique_ptr<ACLTensor> &tensor) {
    if (!tensor || !mem) return false;
    tensor->tensor_copy(mem, false);
    return true;
  }

  bool new_tensor(std::unique_ptr<ACLTensor> &tensor,
                  arm_compute::TensorShape &shape, void *mem = nullptr,
                  bool commit = false);
  bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,
                  std::unique_ptr<ACLTensor> &parent,
                  arm_compute::TensorShape &shape,
                  arm_compute::Coordinates &coord);

  inline int &group() { return _group; }

  inline void set_operator_property(OperateType type, const char *name) {
    name_ = name;
    type_ = type;
  }

  // Binds `input_data` (when given), runs the operator and copies the first
  // output tensor into `output_data` (when given).
  inline void acl_run(void *input_data, void *output_data) {
    if (input_data) tensor_mem(input(), input_data);
    run();
    if (output_data) tensor_mem(output_data, output());
  }

  inline int &input_idx() { return input_idx_; }
  inline int &output_idx() { return output_idx_; }

 protected:
  // True only when the compiled-in GPU backend initialised successfully and
  // the resolved target is that backend.
  inline bool isGPUMode() {
#ifdef USE_OPENCL
    if (!support_opencl_) return false;
    return getTargetHint() == TargetHint::OPENCL;
#elif defined(USE_OPENGLES)
    if (!support_opengles_) return false;
    return getTargetHint() == TargetHint::OPENGLES;
#endif
    return false;
  }

  inline OperatorState &opstate() { return operator_state_; }

  // Re-evaluates the state against `shape` (freeing resources on a shape
  // change) and reports whether the operator can be used as configured.
  inline bool is_operator_init_done(arm_compute::TensorShape shape,
                                    TensorType type = tensor_input) {
    checkreshape(shape, type);
    return operator_state_ == operator_init_done;
  }
  inline void set_operator_init_done() {
    opstate() = operator_init_done;
    set_bypass_state(false);
  }
  inline void set_bypass_state(bool state = false) {
    force_bypass_acl_path_ = state;
  }

  // Updates the state from reshape(); on operator_reinit all tensors and
  // functions are released so the operator can be configured again.
  inline OperatorState checkreshape(arm_compute::TensorShape shape,
                                    TensorType type = tensor_input) {
    opstate() = reshape(shape, type);
    if (opstate() == operator_reinit) {
      freeres();
    }
    return opstate();
  }

  // Compares `shape` with the shape of the existing tensor of role `type`:
  // no tensor -> operator_not_init; same total size and same first two
  // extents -> operator_init_done; otherwise operator_reinit.
  inline OperatorState reshape(arm_compute::TensorShape &shape,
                               TensorType type) {
    arm_compute::TensorShape _shape;
    std::unique_ptr<ACLTensor> &acl_tensor = tensor(type);
    if (!acl_tensor.get()) return operator_not_init;
    _shape = acl_tensor->info().tensor_shape();
    if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] &&
        _shape[1] == shape[1]) {
      return operator_init_done;
    }
    return operator_reinit;
  }

  // Drops every pooled tensor, sub-tensor and configured function.
  inline void freeres() {
    tensor_pool_.clear();
    subtensor_pool_.clear();
    funcs_.clear();
  }

  inline const char *&name() { return name_; }

  inline void set_in_out_index(int indata_idx, int outdata_idx) {
    input_idx() = indata_idx;
    output_idx() = outdata_idx;
  }

 protected:
  std::vector<std::unique_ptr<TensorPair<ACLTensor>>> tensor_pool_;
  std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>> subtensor_pool_;
  std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
  OperatorState operator_state_;
  bool force_bypass_acl_path_;
  TargetHint target_hint_;
  ConvolutionMethodHint convolution_method_hint_;
  static bool support_opengles_;
  static bool support_opencl_;
  static bool init_gpu_env;
  int _group;
  const char *name_;
  OperateType type_;
  int input_idx_, output_idx_;
  bool is_gpu_;
};
// Declared here, defined elsewhere.
// NOTE(review): presumably reports whether ACL scheduling is enabled - confirm
// against the definition.
int isScheduleEnable();
// Creates an OperatorType, configures it with the input/output tensors
// down-cast to TensorType, and returns it as an arm_compute::IFunction.
// A failed dynamic_cast yields a null pointer that is passed to configure()
// unchanged.
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
    arm_compute::ITensor *input, arm_compute::ITensor *output) {
  auto layer = cpp14::make_unique<OperatorType>();
  TensorType *typed_input = dynamic_cast<TensorType *>(input);
  TensorType *typed_output = dynamic_cast<TensorType *>(output);
  layer->configure(typed_input, typed_output);
  return std::move(layer);
}
// Thin forwarding wrapper around the two-tensor instantiate_function().
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
    arm_compute::ITensor *input, arm_compute::ITensor *output) {
  std::unique_ptr<arm_compute::IFunction> configured =
      instantiate_function<OperatorType, TensorType>(input, output);
  return configured;
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor>(input->tensor(), output->tensor());
return func;
}
// Multi-input variant: creates an OperatorType and configures it with a
// collection of inputs plus the output tensor down-cast to TensorType.
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
    VectorTensor inputs, arm_compute::ITensor *output) {
  auto layer = cpp14::make_unique<OperatorType>();
  TensorType *typed_output = dynamic_cast<TensorType *>(output);
  layer->configure(inputs, typed_output);
  return std::move(layer);
}
// Thin forwarding wrapper around the multi-input instantiate_function().
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate(
    VectorTensor inputs, arm_compute::ITensor *output) {
  std::unique_ptr<arm_compute::IFunction> configured =
      instantiate_function<OperatorType, TensorType, VectorTensor>(inputs,
                                                                   output);
  return configured;
}
// Builds a multi-input OpType function (e.g. a concatenation layer).
//
// Collects the first `num` input tensors of `acl_op` (via cinput(i)),
// down-cast to OpTensor, and configures the function with that list and the
// tensor wrapped by `output`. `hint` is accepted for signature symmetry but
// not used here.
//
// The tensor list is a plain local: instantiate()/instantiate_function() take
// the vector by value, so nothing can keep a reference to it after this call.
// The previous function-local `static` vector was therefore unnecessary and
// made the helper non-reentrant (state shared across calls and threads).
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(
    ACLOperator *&acl_op, std::unique_ptr<ACLTensor> &output, int num,
    TargetHint &hint) {
  std::vector<OpTensor *> tensors;
  if (num > 0) {
    tensors.reserve(
        static_cast<typename std::vector<OpTensor *>::size_type>(num));
  }
  for (int i = 0; i < num; ++i) {
    tensors.push_back(
        dynamic_cast<OpTensor *>(acl_op->cinput(i).get()->tensor()));
  }
  return instantiate<OpType, OpTensor, std::vector<OpTensor *>>(
      tensors, output->tensor());
}
// Creates an OperatorType and configures it with input/output tensors
// (down-cast to TensorType) plus an operator-specific info object.
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
    arm_compute::ITensor *input, arm_compute::ITensor *output,
    const OperatorInfo &info) {
  auto layer = cpp14::make_unique<OperatorType>();
  TensorType *typed_input = dynamic_cast<TensorType *>(input);
  TensorType *typed_output = dynamic_cast<TensorType *>(output);
  layer->configure(typed_input, typed_output, info);
  return std::move(layer);
}
// Thin forwarding wrapper around the input/output/info instantiate_function().
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
    arm_compute::ITensor *input, arm_compute::ITensor *output,
    const OperatorInfo &info) {
  std::unique_ptr<arm_compute::IFunction> configured =
      instantiate_function<OperatorType, TensorType, OperatorInfo>(
          input, output, info);
  return configured;
}
template <typename OpType, typename OpTensor, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor, OperatorInfo>(input->tensor(),
output->tensor(), info);
return func;
}
// Creates an OperatorType and configures it with input, weights, biases and
// output tensors (each down-cast to TensorType) plus an info object.
// `biases` may be null; the cast then yields null as well.
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
    arm_compute::ITensor *input, arm_compute::ITensor *weights,
    arm_compute::ITensor *biases, arm_compute::ITensor *output,
    const OperatorInfo &info) {
  auto layer = cpp14::make_unique<OperatorType>();
  TensorType *typed_input = dynamic_cast<TensorType *>(input);
  TensorType *typed_weights = dynamic_cast<TensorType *>(weights);
  TensorType *typed_biases = dynamic_cast<TensorType *>(biases);
  TensorType *typed_output = dynamic_cast<TensorType *>(output);
  layer->configure(typed_input, typed_weights, typed_biases, typed_output,
                   info);
  return std::move(layer);
}
// Thin forwarding wrapper around the weights/biases instantiate_function().
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
    arm_compute::ITensor *input, arm_compute::ITensor *weights,
    arm_compute::ITensor *biases, arm_compute::ITensor *output,
    const OperatorInfo &info) {
  std::unique_ptr<arm_compute::IFunction> configured =
      instantiate_function<OperatorType, TensorType, OperatorInfo>(
          input, weights, biases, output, info);
  return configured;
}
template <typename OpType, typename OpTensor, typename OperatorInfo,
typename ACLTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &weights,
std::unique_ptr<ACLTensor> &biases, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
arm_compute::ITensor *biases_tensor = NULL;
if (biases.get()) {
biases_tensor = biases->tensor();
}
func = instantiate<OpType, OpTensor, OperatorInfo>(
input->tensor(), weights->tensor(), biases_tensor, output->tensor(),
info);
return func;
}
// Batch-normalisation style variant: creates an OperatorType and configures
// it with input, output, mean, variance, beta and gamma tensors (each
// down-cast to TensorType) followed by the epsilon value.
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
    arm_compute::ITensor *input, arm_compute::ITensor *output,
    arm_compute::ITensor *mean, arm_compute::ITensor *var,
    arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) {
  auto layer = cpp14::make_unique<OperatorType>();
  TensorType *typed_input = dynamic_cast<TensorType *>(input);
  TensorType *typed_output = dynamic_cast<TensorType *>(output);
  TensorType *typed_mean = dynamic_cast<TensorType *>(mean);
  TensorType *typed_var = dynamic_cast<TensorType *>(var);
  TensorType *typed_beta = dynamic_cast<TensorType *>(beta);
  TensorType *typed_gamma = dynamic_cast<TensorType *>(gamma);
  layer->configure(typed_input, typed_output, typed_mean, typed_var,
                   typed_beta, typed_gamma, eps);
  return std::move(layer);
}
// Thin forwarding wrapper around the batch-normalisation
// instantiate_function(); `eps` is taken by value and passed on as an lvalue.
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
    arm_compute::ITensor *input, arm_compute::ITensor *output,
    arm_compute::ITensor *mean, arm_compute::ITensor *var,
    arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) {
  std::unique_ptr<arm_compute::IFunction> configured =
      instantiate_function<Dtype, OperatorType, TensorType>(
          input, output, mean, var, beta, gamma, eps);
  return configured;
}
// Builds a batch-normalisation style OpType function from the six wrapped
// tensors and the epsilon value. `hint` is accepted but not used here.
template <typename Dtype, typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    std::unique_ptr<ACLTensor> &mean, std::unique_ptr<ACLTensor> &var,
    std::unique_ptr<ACLTensor> &beta, std::unique_ptr<ACLTensor> &gamma,
    Dtype eps, TargetHint hint) {
  return instantiate<Dtype, OpType, OpTensor>(
      input->tensor(), output->tensor(), mean->tensor(), var->tensor(),
      beta->tensor(), gamma->tensor(), eps);
}
// Appends a pooling function to `func`, choosing the OpenCL, OpenGL ES or
// NEON implementation from `hint` and the compile-time backend macros.
// Falls back to NEON whenever no GPU branch matches. Always returns true.
// `acl_op` is not used by this helper.
template <typename OperatorInfo>
bool instantiate_op_pooling(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, const OperatorInfo &info) {
  using Info = arm_compute::PoolingLayerInfo;
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLPoolingLayer,
                                     arm_compute::ICLTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func<arm_compute::GCPoolingLayer,
                                     arm_compute::IGCTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func<arm_compute::NEPoolingLayer,
                                     arm_compute::ITensor, Info>(
      input, output, info, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends an activation function to `func`, choosing the OpenCL, OpenGL ES
// or NEON implementation from `hint` and the compile-time backend macros.
// Falls back to NEON whenever no GPU branch matches. Always returns true.
// `acl_op` is not used by this helper.
template <typename OperatorInfo>
bool instantiate_op_activation(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, const OperatorInfo &info) {
  using Info = arm_compute::ActivationLayerInfo;
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLActivationLayer,
                                     arm_compute::ICLTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func<arm_compute::GCActivationLayer,
                                     arm_compute::IGCTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func<arm_compute::NEActivationLayer,
                                     arm_compute::ITensor, Info>(
      input, output, info, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends a normalisation (LRN) function to `func`, choosing the OpenCL,
// OpenGL ES or NEON implementation from `hint` and the compile-time backend
// macros. Falls back to NEON whenever no GPU branch matches. Always returns
// true. `acl_op` is not used by this helper.
template <typename OperatorInfo>
bool instantiate_op_lrn(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, const OperatorInfo &info) {
  using Info = arm_compute::NormalizationLayerInfo;
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLNormalizationLayer,
                                     arm_compute::ICLTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func<arm_compute::GCNormalizationLayer,
                                     arm_compute::IGCTensor, Info>(
        input, output, info, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func<arm_compute::NENormalizationLayer,
                                     arm_compute::ITensor, Info>(
      input, output, info, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends the convolution function(s) for `acl_op` to `func`.
//
// Weights, biases, convolution method and group count are read from `acl_op`.
// - groups == 1: one function is appended. conv_method selects between the
//   GEMM-based ConvolutionLayer and the DirectConvolutionLayer; `hint` plus
//   the USE_OPENCL / USE_OPENGLES macros select the CL, GC or NEON variant.
// - groups != 1: input and output are split along dimension 2, weights along
//   dimension 3 and biases along dimension 0 into `groups` sub-tensors, and
//   one function per group is appended. Only OpenCL and NEON variants exist
//   on this path; an OPENGLES hint falls through to the NEON functions.
// Always returns true. The `input` / `output` parameters are only used to
// read shapes; the functions are built from acl_op->input()/output().
template <typename OperatorInfo>
bool instantiate_op_conv(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
ConvolutionMethodHint &conv_method = acl_op->getConvMethod();
bool has_biases = biases.get() ? true : false;
int &groups = acl_op->group();
arm_compute::TensorShape input_shape = input->info().tensor_shape();
arm_compute::TensorShape weights_shape = weights->info().tensor_shape();
// Stays default-constructed when the operator has no biases.
arm_compute::TensorShape biases_shape;
if (has_biases) {
biases_shape = biases->info().tensor_shape();
}
arm_compute::TensorShape output_shape = output->info().tensor_shape();
// Ungrouped convolution: a single function over the full tensors.
if (groups == 1) {
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
// NEON fallback when no GPU branch matched.
{
func.push_back(instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
} else {
// Any non-GEMM method uses the direct convolution layers.
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCDirectConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
// NEON fallback when no GPU branch matched.
{
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
}
return true;
}
// Calculate sub-tensor splits
const int input_split = input_shape.z() / groups;
const int output_split = output_shape.z() / groups;
const int weights_split = weights_shape[3] / groups;
const int biases_split = biases_shape.x() / groups;
// Calculate sub-tensor shapes
input_shape.set(2, input_split);
output_shape.set(2, output_split);
weights_shape.set(3, weights_split);
biases_shape.set(0, biases_split);
for (auto i = 0; i < groups; ++i) {
// Calculate sub-tensors starting coordinates
arm_compute::Coordinates input_coord(0, 0, input_split * i);
arm_compute::Coordinates output_coord(0, 0, output_split * i);
arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
arm_compute::Coordinates biases_coord(biases_split * i);
// Create sub-tensors for input, output, weights and bias
acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape,
input_coord);
acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape,
output_coord);
acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape,
weights_coord);
if (has_biases) {
acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape,
biases_coord);
}
// Set when an OpenCL function was appended for this group, so the NEON
// fallback below is skipped.
bool use_opencl = false;
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
}
}
return true;
}
// Appends a locally-connected layer function to `func`, built from the
// operator's weights and biases. The OpenCL variant is used when compiled
// with USE_OPENCL and `hint` is OPENCL; every other case uses NEON (there is
// no OpenGL ES branch). Always returns true.
template <typename OperatorInfo>
bool instantiate_op_lc(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, const OperatorInfo &info) {
  using Info = arm_compute::PadStrideInfo;
  std::unique_ptr<ACLTensor> &weights = acl_op->weights();
  std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLLocallyConnectedLayer,
                                     arm_compute::ICLTensor, Info>(
        input, weights, biases, output, info, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func<arm_compute::NELocallyConnectedLayer,
                                     arm_compute::ITensor, Info>(
      input, weights, biases, output, info, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends a fully-connected layer function to `func`, built from the
// operator's weights and biases; `info` is forwarded as a bool argument.
// Backend selection follows `hint` and the compile-time macros, with NEON as
// the fallback. Always returns true.
template <typename OperatorInfo>
bool instantiate_op_fc(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, const OperatorInfo &info) {
  std::unique_ptr<ACLTensor> &weights = acl_op->weights();
  std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLFullyConnectedLayer,
                                     arm_compute::ICLTensor, bool>(
        input, weights, biases, output, info, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func<arm_compute::GCFullyConnectedLayer,
                                     arm_compute::IGCTensor, bool>(
        input, weights, biases, output, info, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func<arm_compute::NEFullyConnectedLayer,
                                     arm_compute::ITensor, bool>(
      input, weights, biases, output, info, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends a batch-normalisation function to `func`, using the mean, variance,
// beta and gamma tensors held by `acl_op` and the epsilon value `eps`.
// Backend selection follows `hint` and the compile-time macros, with NEON as
// the fallback. Always returns true.
template <typename Dtype>
bool instantiate_op_bn(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, Dtype eps) {
  std::unique_ptr<ACLTensor> &mean = acl_op->mean();
  std::unique_ptr<ACLTensor> &var = acl_op->var();
  std::unique_ptr<ACLTensor> &beta = acl_op->beta();
  std::unique_ptr<ACLTensor> &gamma = acl_op->gamma();
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn =
        instantiate_op_func<Dtype, arm_compute::CLBatchNormalizationLayer,
                            arm_compute::ICLTensor>(
            input, output, mean, var, beta, gamma, eps, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn =
        instantiate_op_func<Dtype, arm_compute::GCBatchNormalizationLayer,
                            arm_compute::IGCTensor>(
            input, output, mean, var, beta, gamma, eps, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn =
      instantiate_op_func<Dtype, arm_compute::NEBatchNormalizationLayer,
                          arm_compute::ITensor>(
          input, output, mean, var, beta, gamma, eps, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends a softmax function to `func`. Backend selection follows `hint` and
// the compile-time macros, with NEON as the fallback. Always returns true.
// `acl_op` and `data` are not used by this helper.
inline bool instantiate_op_softmax(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, void *data) {
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func<arm_compute::CLSoftmaxLayer,
                                     arm_compute::ICLTensor>(input, output,
                                                             hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func<arm_compute::GCSoftmaxLayer,
                                     arm_compute::IGCTensor>(input, output,
                                                             hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn =
      instantiate_op_func<arm_compute::NESoftmaxLayer, arm_compute::ITensor>(
          input, output, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Appends a depth-concatenation function over the first `num` inputs of
// `acl_op` to `func`. Backend selection follows `hint` and the compile-time
// macros, with NEON as the fallback. Always returns true.
// The `input` parameter is not used; inputs come from acl_op.
inline bool instantiate_op_concat(
    ACLOperator *acl_op,
    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
    TargetHint hint, int num) {
#ifdef USE_OPENCL
  if (hint == TargetHint::OPENCL) {
    auto cl_fn = instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,
                                           arm_compute::ICLTensor>(
        acl_op, output, num, hint);
    func.push_back(std::move(cl_fn));
    return true;
  }
#elif defined(USE_OPENGLES)
  if (hint == TargetHint::OPENGLES) {
    auto gc_fn = instantiate_op_func_lists<arm_compute::GCDepthConcatenateLayer,
                                           arm_compute::IGCTensor>(
        acl_op, output, num, hint);
    func.push_back(std::move(gc_fn));
    return true;
  }
#endif
  auto neon_fn = instantiate_op_func_lists<arm_compute::NEDepthConcatenateLayer,
                                           arm_compute::ITensor>(
      acl_op, output, num, hint);
  func.push_back(std::move(neon_fn));
  return true;
}
// Returns the mutable data pointer of input_data[index] as void*.
// An index of -1 (the default) selects element 0; other values are used
// as-is without range checking. `op` and `type` are unused; `type` only
// drives deduction of Dtype.
template <typename Dtype>
void *InputdataPtr(ACLOperator *op,
                   const std::vector<framework::LoDTensor *> &input_data,
                   Dtype type, int index = -1) {
  const int slot = (index == -1) ? 0 : index;
  Dtype *raw = input_data[slot]->mutable_data<Dtype>();
  return static_cast<void *>(raw);
}
// Multi-input run helper: hands every tensor in `in_data` to
// op->tensor_mem() together with the matching cinput(i), then runs the
// operator with a null input pointer and `out_data` as the output buffer.
// `multi_input_run` is accepted but not consulted.
template <typename Dtype>
void acl_run(ACLOperator *op,
             const std::vector<framework::LoDTensor *> &in_data, void *out_data,
             Dtype type, bool multi_input_run = true) {
  const int input_count = static_cast<int>(in_data.size());
  for (int idx = 0; idx < input_count; ++idx) {
    void *host_ptr = InputdataPtr(op, in_data, type, idx);
    op->tensor_mem(op->cinput(idx), host_ptr);
  }
  op->acl_run(nullptr, out_data);
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
// acl_configure(opname, acl_op, args...)
// Sets the operator property to acl::operate_type_<opname> (with the
// stringified name) and calls instantiate_op_<opname>() with the operator's
// function list, input/output tensors, target hint and the extra arguments.
// The USE_PROFILING variant additionally creates a logtime_util object for
// the duration of the block.
// Uses the GNU named-variadic macro form (`args...`).
#ifdef USE_PROFILING
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
logtime_util log_time(ACL_CONFIG_INFO); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#else
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#endif
// Down-casts `a` to ACLOperator* with dynamic_cast (null on type mismatch).
#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
#endif // USE_ACL
#endif // ACL_OPERATOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace paddle_mobile {
namespace operators {
namespace acl {
#ifdef USE_ACL
// Creates a TensorType, initialises its allocator with `info` (no memory is
// allocated here) and returns it as an owning ITensor pointer.
template <typename TensorType>
std::unique_ptr<arm_compute::ITensor> initialise_tensor(
    arm_compute::TensorInfo &info) {
  std::unique_ptr<TensorType> created = cpp14::make_unique<TensorType>();
  created->allocator()->init(info);
  return std::move(created);
}
// Down-casts `tensor` to TensorType and asks its allocator to allocate.
// The cast result is not checked; the caller must pass a matching type.
template <typename TensorType>
void tensor_allocate(arm_compute::ITensor &tensor) {
  TensorType *concrete = dynamic_cast<TensorType *>(&tensor);
  concrete->allocator()->allocate();
}
// Stores a copy of `info`; no backing tensor exists until set_target() is
// called, and the target starts as DONT_CARE.
Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
: _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
// Move constructor: copies the target and takes over the info and the owned
// backing tensor from `src`.
Tensor::Tensor(Tensor &&src) noexcept
: _target(src._target),
_info(std::move(src._info)),
_tensor(std::move(src._tensor)) {}
// Creates the backing tensor for `target` from the stored info and records
// the target. NEON is always available; OPENCL / OPENGLES only when compiled
// with the matching macro. For any other target the existing backing tensor
// (possibly null) is left untouched, but _target is still updated.
// Returns the current backing tensor.
arm_compute::ITensor *Tensor::set_target(TargetHint target) {
  if (target == TargetHint::NEON) {
    _tensor = initialise_tensor<arm_compute::Tensor>(_info);
  }
#ifdef USE_OPENCL
  else if (target == TargetHint::OPENCL) {
    _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
  }
#elif defined(USE_OPENGLES)
  else if (target == TargetHint::OPENGLES) {
    _tensor = initialise_tensor<arm_compute::GCTensor>(_info);
  }
#endif
  _target = target;
  return _tensor.get();
}
// Allocates memory for the backing tensor according to the recorded target.
// Does nothing when the target has no matching compiled-in backend.
void Tensor::allocate() {
  if (_target == TargetHint::NEON) {
    tensor_allocate<arm_compute::Tensor>(*_tensor);
  }
#ifdef USE_OPENCL
  else if (_target == TargetHint::OPENCL) {
    tensor_allocate<arm_compute::CLTensor>(*_tensor);
  }
#elif defined(USE_OPENGLES)
  else if (_target == TargetHint::OPENGLES) {
    tensor_allocate<arm_compute::GCTensor>(*_tensor);
  }
#endif
}
// Maps the backing GPU tensor when the target is the compiled-in GPU backend
// (OpenCL or OpenGL ES). A no-op for every other target.
void Tensor::map(bool blocking) {
#ifdef USE_OPENCL
  if (_target == TargetHint::OPENCL) {
    auto *cl_tensor = dynamic_cast<arm_compute::CLTensor *>(tensor());
    cl_tensor->map(blocking);
  }
#elif defined(USE_OPENGLES)
  if (_target == TargetHint::OPENGLES) {
    auto *gc_tensor = dynamic_cast<arm_compute::GCTensor *>(tensor());
    gc_tensor->map(blocking);
  }
#endif
}
// Unmaps the backing GPU tensor when the target is the compiled-in GPU
// backend (OpenCL or OpenGL ES). A no-op for every other target.
void Tensor::unmap() {
#ifdef USE_OPENCL
  if (_target == TargetHint::OPENCL) {
    auto *cl_tensor = dynamic_cast<arm_compute::CLTensor *>(tensor());
    cl_tensor->unmap();
  }
#elif defined(USE_OPENGLES)
  if (_target == TargetHint::OPENGLES) {
    auto *gc_tensor = dynamic_cast<arm_compute::GCTensor *>(tensor());
    gc_tensor->unmap();
  }
#endif
}
// Creates a SubTensorType view into `parent` (down-cast to ParentTensorType)
// with the given shape and starting coordinates, returned as an owning
// ITensor pointer.
template <typename SubTensorType, typename ParentTensorType>
std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
    arm_compute::ITensor *parent, arm_compute::TensorShape shape,
    arm_compute::Coordinates coords) {
  ParentTensorType *typed_parent = dynamic_cast<ParentTensorType *>(parent);
  std::unique_ptr<SubTensorType> view =
      cpp14::make_unique<SubTensorType>(typed_parent, shape, coords);
  return std::move(view);
}
// Builds a sub-tensor view into `parent`: inherits the parent's target and
// backing tensor, stores the shape and coordinates, then creates the
// backend-specific sub-tensor object.
SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
                     arm_compute::Coordinates &coords) noexcept
    : _target(parent->target()),
      _tensor_shape(tensor_shape),
      _coords(coords),
      _parent(parent->tensor()),
      _subtensor(nullptr) {
  instantiate_subtensor();
}
// A sub-tensor is pinned to its parent's target: returns the sub-tensor when
// `target` matches that target and nullptr otherwise. Nothing is modified.
arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
  if (target != _target) {
    return nullptr;
  }
  return _subtensor.get();
}
// Returns the owned sub-tensor object (non-owning pointer).
arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
// Const overload: returns the owned sub-tensor object (non-owning pointer).
const arm_compute::ITensor *SubTensor::tensor() const {
return _subtensor.get();
}
// Returns the target inherited from the parent tensor at construction.
TargetHint SubTensor::target() const { return _target; }
// Intentionally empty: this method performs no allocation for sub-tensors.
void SubTensor::allocate() {
// NOP for sub-tensors
}
// Creates the backend-specific sub-tensor object. An OpenCL sub-tensor is
// used when compiled with USE_OPENCL and the target is OPENCL; every other
// target (including NEON) gets the generic arm_compute::SubTensor.
void SubTensor::instantiate_subtensor() {
#ifdef USE_OPENCL
  if (_target == TargetHint::OPENCL) {
    _subtensor =
        initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(
            _parent, _tensor_shape, _coords);
    return;
  }
#endif
  _subtensor =
      initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
          _parent, _tensor_shape, _coords);
}
#endif
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <cstddef>
#include <memory>
#include <type_traits>
#include <utility>
namespace paddle_mobile {
namespace operators {
namespace acl {
// Execution backend selector for ACL tensors and operators.
enum class TargetHint {
// Initial value of Tensor::_target before set_target() is called.
DONT_CARE,
// GPU via OpenCL (only usable when compiled with USE_OPENCL).
OPENCL,
// GPU via OpenGL ES compute (only usable when compiled with USE_OPENGLES).
OPENGLES,
// CPU via NEON.
NEON,
};
// Selects which family of ACL convolution functions is instantiated:
// the *ConvolutionLayer classes (GEMM) or the *DirectConvolutionLayer
// classes (DIRECT).
enum class ConvolutionMethodHint {
GEMM,
DIRECT,
};
// C++11 backport of std::make_unique (C++14).
namespace cpp14 {
// Selects the return type of make_unique depending on whether T is a single
// object, an array of unknown bound, or an array of known bound.
template <class T>
struct _Unique_if {
  typedef std::unique_ptr<T> _Single_object;
};
template <class T>
struct _Unique_if<T[]> {
  typedef std::unique_ptr<T[]> _Unknown_bound;
};
template <class T, std::size_t N>
struct _Unique_if<T[N]> {
  typedef void _Known_bound;
};

// Single object: constructs T from the forwarded arguments.
template <class T, class... Args>
typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}

// Array of unknown bound: allocates n value-initialised elements.
template <class T>
typename _Unique_if<T>::_Unknown_bound make_unique(std::size_t n) {
  typedef typename std::remove_extent<T>::type U;
  return std::unique_ptr<T>(new U[n]());
}

// Array of known bound is disallowed, as in the standard. Deleting the
// overload turns misuse into a compile-time error instead of an unresolved
// symbol at link time.
template <class T, class... Args>
typename _Unique_if<T>::_Known_bound make_unique(Args &&...) = delete;
}  // namespace cpp14
// Owning wrapper around an ACL tensor whose concrete backend type
// (NEON / OpenCL / OpenGL ES) is chosen later through set_target().
class Tensor {
 public:
  // Stores a copy of `info`; no backing tensor is created yet.
  explicit Tensor(arm_compute::TensorInfo &info) noexcept;
  virtual ~Tensor() {}
  // Takes over the info and backing tensor of `src`.
  Tensor(Tensor &&src) noexcept;
  // Replaces the stored tensor info. The argument is an rvalue reference, so
  // it is moved from rather than copied.
  void set_info(arm_compute::TensorInfo &&info) { _info = std::move(info); }
  // Creates the backing tensor for `target` and returns it.
  arm_compute::ITensor *set_target(TargetHint target);
  // Returns the stored tensor info.
  const arm_compute::TensorInfo &info() const { return _info; }
  // Returns the backing tensor, or nullptr if none has been created.
  arm_compute::ITensor *tensor() { return _tensor.get(); }
  // Allocates memory for the backing tensor according to the target.
  void allocate();
  // Intentionally empty.
  void init() {}
  // Returns the target recorded by the last set_target() call.
  TargetHint target() const { return _target; }
  // Map / unmap the backing tensor; only GPU targets do any work.
  virtual void map(bool blocking = true);
  virtual void unmap();

 private:
  TargetHint _target;
  arm_compute::TensorInfo _info;
  std::unique_ptr<arm_compute::ITensor> _tensor;
};
// View into a region of a parent Tensor, described by a shape and starting
// coordinates. The view is pinned to the parent's target.
class SubTensor {
 public:
  // Creates the view and instantiates the backend-specific sub-tensor.
  SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
            arm_compute::Coordinates &coords) noexcept;
  ~SubTensor() {}

  // Accessors for the owned sub-tensor object.
  arm_compute::ITensor *tensor();
  const arm_compute::ITensor *tensor() const;

  // Target inherited from the parent.
  TargetHint target() const;

  // No-op for sub-tensors.
  void allocate();

  // Returns the sub-tensor if `target` matches the pinned target, else null.
  arm_compute::ITensor *set_target(TargetHint target);

 private:
  /** Instantiates a sub-tensor */
  void instantiate_subtensor();

  /** Target that this tensor is pinned on */
  TargetHint _target;
  /** SubTensor shape */
  arm_compute::TensorShape _tensor_shape;
  /** SubTensor Coordinates */
  arm_compute::Coordinates _coords;
  /** Parent tensor */
  arm_compute::ITensor *_parent;
  /** SubTensor */
  std::unique_ptr<arm_compute::ITensor> _subtensor;
};
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
...@@ -15,20 +15,152 @@ limitations under the License. */ ...@@ -15,20 +15,152 @@ limitations under the License. */
#ifdef BATCHNORM_OP #ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h" #include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
// ACL-backed batch-normalisation operator for the Mali GPU path.
// Typical use, as seen in BatchNormKernel: Bypass_acl() refreshes the cached
// parameters and says whether to skip ACL, InitAclLayer() lazily
// (re)configures the ACL function, RunAcl() executes it.
template <typename DeviceType, typename T>
class AclBatchNormOp : public acl::ACLOperator {
 public:
  AclBatchNormOp() {
    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
  }
  ~AclBatchNormOp() = default;

  // Neither copyable nor movable.
  AclBatchNormOp(const AclBatchNormOp&) = delete;
  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
  AclBatchNormOp(AclBatchNormOp&&) = delete;
  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;

  // Parameters captured by the last Bypass_acl() call.
  acl::AclParameters& getargs() { return args; }

  // Configures the ACL batch-normalisation function from the cached
  // parameters. Returns early when the operator is already initialised for
  // the current input shape. `param` itself is not read here.
  void InitAclLayer(const BatchNormParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);

    // Shapes are given to ACL as [width, height, channels, batch].
    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
                                         args.in_depth, args.batch);
    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
                                          args.out_depth, args.out_num);

    if (is_operator_init_done(input_shape)) {
      return;
    }
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;

    // Mean, variance, beta and gamma all hold one value per input channel.
    arm_compute::TensorShape mean_shape(args.in_depth);
    arm_compute::TensorShape var_shape = mean_shape;
    arm_compute::TensorShape beta_shape = mean_shape;
    arm_compute::TensorShape gamma_shape = mean_shape;

    new_tensor(input(), input_shape, args.input_data);
    new_tensor(output(), output_shape, args.output_data);
    new_tensor(mean(), mean_shape, args.mean_data);
    new_tensor(var(), var_shape, args.var_data);
    new_tensor(beta(), beta_shape, args.biases_data);
    new_tensor(gamma(), gamma_shape, args.weight_data);

    acl_configure(bn, this, args.epsilon);
  }

  // Executes the configured function on the given buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes the cached parameters from `param`, then reports whether the
  // ACL path must be skipped (i.e. the forced-bypass flag is set).
  bool Bypass_acl(const BatchNormParam& param) {
    AclParametersByContext(param);
    if (this->force_bypass_acl_path_) {
      return true;
    }
    return false;
  }

 private:
  // Copies data pointers, epsilon and dimensions out of `param` into `args`.
  // Dimensions 0..3 of the input and output are read unconditionally.
  void AclParametersByContext(const BatchNormParam& param) {
    const Tensor* x = param.InputX();
    Tensor* y = param.OutputY();
    const Tensor* gamma_tensor = param.InputScale();
    const Tensor* beta_tensor = param.InputBias();
    const Tensor* mean_tensor = param.InputMean();
    const Tensor* variance_tensor = param.InputVariance();

    const T* x_ptr = x->data<T>();
    T* y_ptr = y->mutable_data<T>();
    const T* gamma_ptr = gamma_tensor->data<T>();
    const T* beta_ptr = beta_tensor->data<T>();
    const T* mean_ptr = mean_tensor->data<T>();
    const T* variance_ptr = variance_tensor->data<T>();
    float eps = param.Epsilon();

    args.input_data = (void*)x_ptr;
    args.output_data = (void*)y_ptr;
    args.mean_data = (void*)mean_ptr;
    args.var_data = (void*)variance_ptr;
    args.epsilon = eps;

    args.dim = x->dims().size();
    args.batch = x->dims()[0];
    args.in_depth = x->dims()[1];
    args.in_rows = x->dims()[2];
    args.in_cols = x->dims()[3];

    args.out_num = y->dims()[0];
    args.out_depth = y->dims()[1];
    args.out_rows = y->dims()[2];
    args.out_cols = y->dims()[3];

    args.weight_data = (void*)gamma_ptr;
    args.biases_data = (void*)beta_ptr;
  }

  acl::AclParameters args;
};
template <> template <>
bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam &para) const { bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclBatchNormOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true; return true;
} }
template <> template <>
void BatchNormKernel<GPU_MALI, float>::Compute( void BatchNormKernel<GPU_MALI, float>::Compute(
const BatchNormParam &param) const {} const BatchNormParam& param) const {
std::cout << "init acl" << std::endl;
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class BatchNormKernel<GPU_MALI, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wraps an ACL concat layer for the Mali backend. Arguments are cached in
// `args` by Bypass_acl(), turned into ACL tensors by InitAclLayer(), and
// executed by RunAcl(). Non-copyable and non-movable.
template <typename DeviceType, typename T>
class AclConcatOp : public acl::ACLOperator {
 public:
  AclConcatOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
  }
  ~AclConcatOp() = default;
  AclConcatOp(const AclConcatOp&) = delete;
  AclConcatOp& operator=(const AclConcatOp&) = delete;
  AclConcatOp(AclConcatOp&&) = delete;
  AclConcatOp& operator=(AclConcatOp&&) = delete;

  // Gives callers access to the arguments cached by Bypass_acl().
  acl::AclParameters& getargs() { return args; }

  // Builds the ACL input/output tensors and configures the concat layer.
  // Does nothing when is_operator_init_done() reports the output shape as
  // already initialised. Relies on `args` having been filled by Bypass_acl().
  void InitAclLayer(const ConcatParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
                                          args.out_depth, args.batch);
    if (is_operator_init_done(output_shape)) return;
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    // Value-initialised tag; only forwarded to acl::InputdataPtr.
    T type = T();
    const int input_count = static_cast<int>(input_data->size());
    for (int i = 0; i < input_count; i++) {
      // Dims are read as NCHW, matching how out_rows (dims[2]) and
      // out_cols (dims[3]) are filled in AclParametersByContext().
      int in_channels = (*input_data)[i]->dims()[1];
      int in_height = (*input_data)[i]->dims()[2];
      int in_width = (*input_data)[i]->dims()[3];
      //[width, height, IFM] - same ordering as output_shape above
      arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
      new_tensor(cinput(i), in_shape,
                 acl::InputdataPtr(this, args.in_tensor, type, i));
    }
    //[width, height, OFM]
    new_tensor(output(), output_shape, args.output_data);
    acl_configure(concat, this, input_data->size());
  }

  // Executes the configured layer on `input`, writing to `output`.
  void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
    T type = T();
    acl::acl_run(this, input, output, type);
  }

  // Refreshes `args` from `param` and returns true when the ACL path must be
  // skipped: either it is force-disabled or the concat axis is not 1.
  bool Bypass_acl(const ConcatParam& param) {
    bool bypass_acl = false;
    AclParametersByContext(param);
    if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
      bypass_acl = true;
    }
    return bypass_acl;
  }

 private:
  // Copies inputs, output pointer, axis flag and output dims into `args`.
  void AclParametersByContext(const ConcatParam& param) {
    auto inputs = param.Inputs();
    auto* output = param.Out();
    int64_t axis = param.Axis();
    T* output_data = output->mutable_data<T>();
    args.is_channel_concat = (axis == 1);
    args.in_tensor = inputs;
    args.output_data = (void*)output_data;
    args.batch = output->dims()[0];
    args.out_depth = output->dims()[1];
    args.out_rows = output->dims()[2];
    args.out_cols = output->dims()[3];
  }

  acl::AclParameters args;
};
// Ensures an ACL concat operator is attached to this kernel; one is created
// only when GetAclOp() is still null. Always returns true.
template <>
bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
  using AclOp = AclConcatOp<GPU_MALI, float>;
  if (this->GetAclOp() == nullptr) {
    AclOp* fresh_op = new AclOp();
    this->SetAclOp(static_cast<void*>(fresh_op),
                   const_cast<void*>(static_cast<const void*>(this)));
  }
  return true;
}
// Runs concat through the ACL operator attached by Init(). Returns early
// when no operator is attached or when Bypass_acl() rejects the request.
// NOTE(review): on the bypass path nothing is written to the output here.
template <>
void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
  using AclOp = AclConcatOp<GPU_MALI, float>;
  std::cout << "init acl" << std::endl;
  AclOp* op = static_cast<AclOp*>(this->GetAclOp());
  if (op == nullptr) return;
  // Bypass_acl() also refreshes the operator's cached arguments.
  if (op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }
  acl::AclParameters& cached = op->getargs();
  // Snapshot the inputs and output pointer before the layer is configured.
  std::vector<framework::LoDTensor*> inputs = cached.in_tensor;
  const float* out_ptr = (const float*)cached.output_data;
  op->InitAclLayer(param);
  op->RunAcl(inputs, (void*)out_ptr);
}
// Explicit instantiation of the concat kernel for GPU_MALI / float.
template class ConcatKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wraps an ACL convolution layer (with optional bias) for the fused
// conv+add Mali kernel. Arguments are cached in `args` by Bypass_acl(),
// turned into ACL tensors by InitAclLayer(), and executed by RunAcl().
// Non-copyable and non-movable.
template <typename DeviceType, typename T>
class AclConvAddOp : public acl::ACLOperator {
 public:
  AclConvAddOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
  }
  ~AclConvAddOp() = default;
  AclConvAddOp(const AclConvAddOp&) = delete;
  AclConvAddOp& operator=(const AclConvAddOp&) = delete;
  AclConvAddOp(AclConvAddOp&&) = delete;
  AclConvAddOp& operator=(AclConvAddOp&&) = delete;

  // Gives callers access to the arguments cached by Bypass_acl().
  acl::AclParameters& getargs() { return args; }

  // Builds the ACL weight/bias/input/output tensors and configures the
  // convolution. Does nothing when is_operator_init_done() reports the input
  // shape as already initialised. Relies on `args` filled by Bypass_acl().
  void InitAclLayer(const FusionConvAddParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
                                         args.in_depth, args.batch);
    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
                                          args.out_depth, args.out_num);
    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
                                           args.in_depth / args.num_group,
                                           args.out_depth);
    arm_compute::TensorShape biases_shape(args.out_depth);
    arm_compute::PadStrideInfo conv_info(
        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
        arm_compute::DimensionRoundingType::FLOOR);
    if (is_operator_init_done(input_shape)) return;
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    check_direct_conv();
    //[kernel_x, kernel_y, IFM, OFM]
    new_tensor(weights(), weights_shape, args.weight_data);
    //[OFM] - only created when a bias tensor was found
    if (args.biases_data) {
      new_tensor(biases(), biases_shape, args.biases_data);
    }
    group() = args.num_group;
    //[width, height, IFM]
    new_tensor(input(), input_shape, args.input_data);
    //[width, height, OFM]
    new_tensor(output(), output_shape, args.output_data);
    acl_configure(conv, this, conv_info);
  }

  // Executes the configured layer on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param` and returns true when the ACL path must be
  // skipped: force-disabled, 5 or more groups, args.dim > 2, or dilation.
  // NOTE(review): args.dim is not assigned anywhere in this class; confirm
  // where it is expected to be set.
  bool Bypass_acl(const FusionConvAddParam& param) {
    bool bypass_acl = false;
    AclParametersByContext(param);
    // for performance, more groups impact GPU performance
    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
      bypass_acl = true;
    }
    if (args.dim > 2) {
      bypass_acl = true;
    }
    if (args.dilated) {
      bypass_acl = true;
    }
    return bypass_acl;
  }

 private:
  // Selects the direct-convolution method when the DIRECTCONV environment
  // variable parses to a non-zero integer and the kernel/padding is either
  // 1x1 with no padding or 3x3 with padding <= 1.
  void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      // Accept the value only when sscanf really parsed an integer, so an
      // unparsable string can never leave bdirectconv indeterminate.
      if (sscanf(pDirectConv, "%i", &bdirectconv) == 1 && bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %x\n", static_cast<unsigned int>(use_direct_conv));
      }
    }
    int pad_data[2], kernel[2];
    pad_data[1] = args.pad_rows;
    pad_data[0] = args.pad_cols;
    kernel[1] = args.filter_rows;
    kernel[0] = args.filter_cols;
    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
                             pad_data[0] == 0 && pad_data[1] == 0) ||
                            (kernel[0] == 3 && kernel[1] == 3 &&
                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
    }
  }

  // Copies data pointers, group count, dilation, dims, strides and paddings
  // from `param` into `args`.
  void AclParametersByContext(const FusionConvAddParam& param) {
    const Tensor* input = param.Input();
    const Tensor* filter = param.Filter();
    Tensor* output = param.Output();
    // Must start as null: if Bias() throws, the check below would otherwise
    // read an indeterminate pointer.
    Tensor* bias = nullptr;
    int groups = param.Groups();
    std::vector<int> strides = param.Strides();
    std::vector<int> paddings = param.Paddings();
    std::vector<int> dilations = param.Dilations();
    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>();
    const T* weight_data = filter->data<T>();
    args.input_data = (void*)input_data;
    args.output_data = (void*)output_data;
    args.weight_data = (void*)weight_data;
    args.biases_data = nullptr;
    try {
      bias = param.Bias();
    } catch (const std::exception&) {
      // Best effort: a failing bias lookup is treated as "no bias".
      bias = nullptr;
    }
    if (bias) {
      const T* biases_data = bias->data<T>();
      args.biases_data = (void*)biases_data;
    }
    args.num_group = groups;
    args.dilation_rows = dilations[0];
    args.dilation_cols = dilations[1];
    // Assign both outcomes so a flag from an earlier call cannot stick.
    args.dilated = (dilations[0] != 1 || dilations[1] != 1);
    // Input dims, read as NCHW
    args.batch = input->dims()[0];
    args.in_depth = input->dims()[1];
    args.in_rows = input->dims()[2];
    args.in_cols = input->dims()[3];
    // Output dims, read as NCHW
    args.out_num = output->dims()[0];
    args.out_depth = output->dims()[1];
    args.out_rows = output->dims()[2];
    args.out_cols = output->dims()[3];
    // MCHW = OIHW
    args.filter_rows = filter->dims()[2];
    args.filter_cols = filter->dims()[3];
    // strides(h_stride, w_stride)
    args.stride_rows = strides[0];
    args.stride_cols = strides[1];
    // paddings(h_pad, w_pad)
    args.pad_rows = paddings[0];
    args.pad_cols = paddings[1];
  }

  acl::AclParameters args;
};
// Ensures an ACL conv+add operator is attached to this kernel; one is
// created only when GetAclOp() is still null. Always returns true.
template <>
bool ConvAddKernel<GPU_MALI, float>::Init(
    const FusionConvAddParam& param) const {
  using AclOp = AclConvAddOp<GPU_MALI, float>;
  if (this->GetAclOp() == nullptr) {
    AclOp* fresh_op = new AclOp();
    this->SetAclOp(static_cast<void*>(fresh_op),
                   const_cast<void*>(static_cast<const void*>(this)));
  }
  return true;
}
// Runs the fused conv+add through the ACL operator attached by Init().
// Returns early when no operator is attached or when Bypass_acl() rejects
// the request.
// NOTE(review): on the bypass path nothing is written to the output here.
template <>
void ConvAddKernel<GPU_MALI, float>::Compute(
    const FusionConvAddParam& param) const {
  using AclOp = AclConvAddOp<GPU_MALI, float>;
  std::cout << "init acl" << std::endl;
  AclOp* op = static_cast<AclOp*>(this->GetAclOp());
  if (op == nullptr) return;
  // Bypass_acl() also refreshes the operator's cached arguments.
  if (op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }
  acl::AclParameters& cached = op->getargs();
  // Capture the buffers before the layer is configured.
  const float* in_ptr = (const float*)cached.input_data;
  const float* out_ptr = (const float*)cached.output_data;
  op->InitAclLayer(param);
  op->RunAcl((void*)in_ptr, (void*)out_ptr);
}
// Explicit instantiation of the fused conv+add kernel for GPU_MALI / float.
template class ConvAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
...@@ -15,20 +15,213 @@ limitations under the License. */ ...@@ -15,20 +15,213 @@ limitations under the License. */
#ifdef CONV_OP #ifdef CONV_OP
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
// Wraps an ACL convolution layer (no bias) for the Mali conv kernel.
// Arguments are cached in `args` by Bypass_acl(), turned into ACL tensors by
// InitAclLayer(), and executed by RunAcl(). Non-copyable and non-movable.
template <typename DeviceType, typename T>
class AclConvOp : public acl::ACLOperator {
 public:
  AclConvOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
  }
  ~AclConvOp() = default;
  AclConvOp(const AclConvOp&) = delete;
  AclConvOp& operator=(const AclConvOp&) = delete;
  AclConvOp(AclConvOp&&) = delete;
  AclConvOp& operator=(AclConvOp&&) = delete;

  // Gives callers access to the arguments cached by Bypass_acl().
  acl::AclParameters& getargs() { return args; }

  // Builds the ACL weight/input/output tensors and configures the
  // convolution. Does nothing when is_operator_init_done() reports the input
  // shape as already initialised. Relies on `args` filled by Bypass_acl().
  void InitAclLayer(const ConvParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
                                         args.in_depth, args.batch);
    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
                                          args.out_depth, args.out_num);
    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
                                           args.in_depth / args.num_group,
                                           args.out_depth);
    arm_compute::PadStrideInfo conv_info(
        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
        arm_compute::DimensionRoundingType::FLOOR);
    if (is_operator_init_done(input_shape)) return;
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    check_direct_conv();
    //[kernel_x, kernel_y, IFM, OFM]
    new_tensor(weights(), weights_shape, args.weight_data);
    // No bias tensor is created for this operator.
    group() = args.num_group;
    //[width, height, IFM]
    new_tensor(input(), input_shape, args.input_data);
    //[width, height, OFM]
    new_tensor(output(), output_shape, args.output_data);
    acl_configure(conv, this, conv_info);
  }

  // Executes the configured layer on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param` and returns true when the ACL path must be
  // skipped: force-disabled, 5 or more groups, args.dim > 2, or dilation.
  // NOTE(review): args.dim is not assigned anywhere in this class; confirm
  // where it is expected to be set.
  bool Bypass_acl(const ConvParam& param) {
    bool bypass_acl = false;
    AclParametersByContext(param);
    // for performance, more groups impact GPU performance
    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
      bypass_acl = true;
    }
    if (args.dim > 2) {
      bypass_acl = true;
    }
    if (args.dilated) {
      bypass_acl = true;
    }
    return bypass_acl;
  }

 private:
  // Selects the direct-convolution method when the DIRECTCONV environment
  // variable parses to a non-zero integer and the kernel/padding is either
  // 1x1 with no padding or 3x3 with padding <= 1.
  void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      // Accept the value only when sscanf really parsed an integer, so an
      // unparsable string can never leave bdirectconv indeterminate.
      if (sscanf(pDirectConv, "%i", &bdirectconv) == 1 && bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %x\n", static_cast<unsigned int>(use_direct_conv));
      }
    }
    int pad_data[2], kernel[2];
    pad_data[1] = args.pad_rows;
    pad_data[0] = args.pad_cols;
    kernel[1] = args.filter_rows;
    kernel[0] = args.filter_cols;
    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
                             pad_data[0] == 0 && pad_data[1] == 0) ||
                            (kernel[0] == 3 && kernel[1] == 3 &&
                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
    }
  }

  // Copies data pointers, group count, dilation, dims, strides and paddings
  // from `param` into `args`. The bias pointer is always cleared.
  void AclParametersByContext(const ConvParam& param) {
    const Tensor* input = param.Input();
    const Tensor* filter = param.Filter();
    Tensor* output = param.Output();
    int groups = param.Groups();
    std::vector<int> strides = param.Strides();
    std::vector<int> paddings = param.Paddings();
    std::vector<int> dilations = param.Dilations();
    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>();
    const T* weight_data = filter->data<T>();
    args.input_data = (void*)input_data;
    args.output_data = (void*)output_data;
    args.weight_data = (void*)weight_data;
    args.biases_data = nullptr;
    args.num_group = groups;
    args.dilation_rows = dilations[0];
    args.dilation_cols = dilations[1];
    // Assign both outcomes so a flag from an earlier call cannot stick.
    args.dilated = (dilations[0] != 1 || dilations[1] != 1);
    // Input dims, read as NCHW
    args.batch = input->dims()[0];
    args.in_depth = input->dims()[1];
    args.in_rows = input->dims()[2];
    args.in_cols = input->dims()[3];
    // NOTE(review): debug trace emitted on every call; the sibling conv+add
    // operator has the same trace commented out.
    std::cout << "In N: " << args.batch << " C: " << args.in_depth
              << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
    // Output dims, read as NCHW
    args.out_num = output->dims()[0];
    args.out_depth = output->dims()[1];
    args.out_rows = output->dims()[2];
    args.out_cols = output->dims()[3];
    // MCHW = OIHW
    args.filter_rows = filter->dims()[2];
    args.filter_cols = filter->dims()[3];
    // strides(h_stride, w_stride)
    args.stride_rows = strides[0];
    args.stride_cols = strides[1];
    // paddings(h_pad, w_pad)
    args.pad_rows = paddings[0];
    args.pad_cols = paddings[1];
  }

  acl::AclParameters args;
};
template <> template <>
bool ConvKernel<GPU_MALI, float>::Init(const ConvParam &para) const { bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true; return true;
} }
template <> template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const { void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
// ArmConvImplement imp; std::cout << "init acl" << std::endl;
// imp.Compute(param); AclConvOp<GPU_MALI, float>* acl_op =
param.Output()->mutable_data<float>()[0] = 100.0; reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
} }
template class ConvKernel<GPU_MALI, float>; template class ConvKernel<GPU_MALI, float>;
...@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>; ...@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace paddle_mobile {
namespace operators {
// Binary functor returning the sum of its two operands, converted to T.
template <typename T>
struct AddFunctor {
  inline T operator()(T lhs, T rhs) const {
    const T sum = lhs + rhs;
    return sum;
  }
};
// This kernel performs no setup; initialization always reports success.
template <>
bool ElementwiseAddKernel<GPU_MALI, float>::Init(
    const ElementwiseAddParam & /*para*/) const {
  const bool initialized = true;
  return initialized;
}
// Adds InputX and InputY element-wise into Out by handing an
// AddFunctor<float> to ElementwiseComputeEx together with the param's axis.
template <>
void ElementwiseAddKernel<GPU_MALI, float>::Compute(
    const ElementwiseAddParam &param) const {
  const Tensor *lhs = param.InputX();
  const Tensor *rhs = param.InputY();
  Tensor *result = param.Out();
  // Called for its effect on the output tensor; the pointer is not needed.
  result->mutable_data<float>();
  const int axis = param.Axis();
  const AddFunctor<float> adder = AddFunctor<float>();
  ElementwiseComputeEx<AddFunctor<float>, float>(lhs, rhs, axis, adder,
                                                 result);
}
// Explicit instantiation of the elementwise-add kernel for GPU_MALI / float.
template class ElementwiseAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile {
namespace operators {
// This kernel performs no setup; initialization always reports success.
template <>
bool FusionFcKernel<GPU_MALI, float>::Init(
    const FusionFcParam & /*para*/) const {
  const bool initialized = true;
  return initialized;
}
// Fused fully-connected layer: Out = X * Y + Z.
// Each output row is first filled with the 1-D tensor Z, then matmul is
// called with a trailing scale of 1 so the product is accumulated on top.
// X and Y are flattened to matrices when they have more than two dims, and
// Out is resized to {rows(X), cols(Y)} when it is not already 2-D.
template <>
void FusionFcKernel<GPU_MALI, float>::Compute(
    const FusionFcParam &param) const {
  const Tensor *input_x = param.InputX();
  const Tensor *input_y = param.InputY();
  const Tensor *input_z = param.InputZ();
  auto *input_z_data = input_z->data<float>();
  int axis = param.Axis();
  Tensor *out = param.Out();
  auto *out_data = out->mutable_data<float>();
  const Tensor x_matrix =
      input_x->dims().size() > 2
          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
          : *input_x;
  const Tensor y_matrix =
      input_y->dims().size() > 2
          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
          : *input_y;
  if (out->dims().size() != 2) {
    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
  }
  // Read the dims only after the possible Resize so the checks and the
  // row loop below see the matrix shape, not the shape from before it.
  auto out_dim = out->dims();
  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1");
  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
                        " out_dim[1] must equal input_z->dims()[0].");
  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
  // Seed every output row with Z.
  int64_t classes = input_z->numel();
  for (int i = 0; i < out_dim[0]; i++) {
    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
  }
  for (int i = 0; i < out->numel(); i++) {
    DLOG << out_data[i];
  }
  // Trailing scale of 1 keeps the seeded Z values and adds X * Y.
  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
                      out, static_cast<float>(1));
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wraps an ACL cross-map normalization (LRN) layer for the Mali backend.
// Arguments are cached in `args` by Bypass_acl(), turned into ACL tensors by
// InitAclLayer(), and executed by RunAcl(). Non-copyable and non-movable.
template <typename DeviceType, typename T>
class AclLrnOp : public acl::ACLOperator {
 public:
  AclLrnOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
  }
  ~AclLrnOp() = default;
  AclLrnOp(const AclLrnOp&) = delete;
  AclLrnOp& operator=(const AclLrnOp&) = delete;
  AclLrnOp(AclLrnOp&&) = delete;
  AclLrnOp& operator=(AclLrnOp&&) = delete;

  // Gives callers access to the arguments cached by Bypass_acl().
  acl::AclParameters& getargs() { return args; }

  // Builds the input/output tensors (both use the same 3-D shape, so one
  // image per run) and configures the LRN layer. Does nothing when
  // is_operator_init_done() reports the shape as already initialised.
  void InitAclLayer(const LrnParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
    if (is_operator_init_done(shape)) {
      return;
    }
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    arm_compute::NormalizationLayerInfo norm_info(
        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
        args.knorm);
    //[width, height, IFM]
    new_tensor(input(), shape, args.input_data);
    //[width, height, OFM]
    new_tensor(output(), shape, args.output_data);
    acl_configure(lrn, this, norm_info);
  }

  // Executes the configured layer on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param` and reports whether the ACL path is
  // force-disabled.
  bool Bypass_acl(const LrnParam& param) {
    AclParametersByContext(param);
    return this->force_bypass_acl_path_ ? true : false;
  }

 private:
  // Copies data pointers, LRN coefficients and input dims into `args`.
  void AclParametersByContext(const LrnParam& param) {
    const Tensor* src = param.InputX();
    Tensor* dst = param.Out();
    const int window = param.N();
    const T alpha_coeff = param.Alpha();
    const T beta_coeff = param.Beta();
    const T bias_k = param.K();
    const T* src_data = src->data<T>();
    T* dst_data = dst->mutable_data<T>();
    args.input_data = (void*)src_data;
    args.output_data = (void*)dst_data;
    args.nsize = window;
    args.alpha = alpha_coeff;
    args.beta = beta_coeff;
    args.knorm = bias_k;
    // Input dims, read as NCHW
    args.batch = src->dims()[0];
    args.in_depth = src->dims()[1];
    args.in_rows = src->dims()[2];
    args.in_cols = src->dims()[3];
  }

  acl::AclParameters args;
};
// Ensures an ACL LRN operator is attached to this kernel; one is created
// only when GetAclOp() is still null. Always returns true.
template <>
bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
  using AclOp = AclLrnOp<GPU_MALI, float>;
  if (this->GetAclOp() == nullptr) {
    AclOp* fresh_op = new AclOp();
    this->SetAclOp(static_cast<void*>(fresh_op),
                   const_cast<void*>(static_cast<const void*>(this)));
  }
  return true;
}
// Runs LRN through the ACL operator attached by Init(), one batch item at a
// time, advancing both buffers by one image (depth * cols * rows floats)
// after each run. Returns early when no operator is attached or when
// Bypass_acl() rejects the request.
// NOTE(review): on the bypass path nothing is written to the output here.
template <>
void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
  using AclOp = AclLrnOp<GPU_MALI, float>;
  std::cout << "init acl" << std::endl;
  AclOp* op = static_cast<AclOp*>(this->GetAclOp());
  if (op == nullptr) return;
  // Bypass_acl() also refreshes the operator's cached arguments.
  if (op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }
  acl::AclParameters& cached = op->getargs();
  const float* src = (const float*)cached.input_data;
  const float* dst = (const float*)cached.output_data;
  op->InitAclLayer(param);
  int item = 0;
  while (item < cached.batch) {
    op->RunAcl((void*)src, (void*)dst);
    src += cached.in_depth * cached.in_cols * cached.in_rows;
    dst += cached.in_depth * cached.in_cols * cached.in_rows;
    ++item;
  }
}
// Explicit instantiation of the LRN kernel for GPU_MALI / float.
template class LrnKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
// This kernel performs no setup; initialization always reports success.
template <>
bool MulKernel<GPU_MALI, float>::Init(const MulParam & /*para*/) const {
  const bool initialized = true;
  return initialized;
}
// Matrix product Out = X * Y. Inputs with more than two dims are flattened
// to matrices first. When Out is not 2-D it is temporarily resized to
// {rows(X), cols(Y)} for the matmul and given its original dims back after.
template <>
void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
  const Tensor *lhs = param.InputX();
  const Tensor *rhs = param.InputY();
  Tensor *result = param.Out();
  // Called for its effect on the output tensor; the pointer is not needed.
  result->mutable_data<float>();
  const Tensor lhs_matrix =
      lhs->dims().size() > 2
          ? framework::ReshapeToMatrix(*lhs, param.XNumColDims())
          : *lhs;
  const Tensor rhs_matrix =
      rhs->dims().size() > 2
          ? framework::ReshapeToMatrix(*rhs, param.YNumColDims())
          : *rhs;
  // Remember the caller-visible shape so it can be restored below.
  auto original_dim = result->dims();
  const bool needs_flatten = original_dim.size() != 2;
  if (needs_flatten) {
    result->Resize({lhs_matrix.dims()[0], rhs_matrix.dims()[1]});
  }
  const float alpha = static_cast<float>(1);
  const float beta = static_cast<float>(0);
  math::matmul<float>(lhs_matrix, false, rhs_matrix, false, alpha, result,
                      beta);
  if (needs_flatten) {
    result->Resize(original_dim);
  }
}
// Explicit instantiation of the mul kernel for GPU_MALI / float.
template class MulKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wraps an ACL pooling layer (max or average) for the Mali backend.
// Arguments are cached in `args` by Bypass_acl(), turned into ACL tensors by
// InitAclLayer(), and executed by RunAcl(). Non-copyable and non-movable.
template <typename DeviceType, typename T>
class AclPoolOp : public acl::ACLOperator {
 public:
  AclPoolOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
  }
  ~AclPoolOp() = default;
  AclPoolOp(const AclPoolOp&) = delete;
  AclPoolOp& operator=(const AclPoolOp&) = delete;
  AclPoolOp(AclPoolOp&&) = delete;
  AclPoolOp& operator=(AclPoolOp&&) = delete;

  // Gives callers access to the arguments cached by Bypass_acl().
  acl::AclParameters& getargs() { return args; }

  // Builds the 3-D input/output tensors and configures the pooling layer.
  // Pool type "max" selects MAX, anything else AVG; the pool size is
  // args.filter_rows (Bypass_acl() rejects non-square windows). Does nothing
  // when is_operator_init_done() reports the input shape as initialised.
  void InitAclLayer(const PoolParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
                                         args.in_depth);
    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
                                          args.out_depth);
    const arm_compute::PoolingType pool_kind =
        (args.pool_type == "max") ? arm_compute::PoolingType::MAX
                                  : arm_compute::PoolingType::AVG;
    arm_compute::PoolingLayerInfo pool_info(
        pool_kind, args.filter_rows,
        arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
                                   args.pad_cols, args.pad_rows,
                                   arm_compute::DimensionRoundingType::CEIL));
    if (is_operator_init_done(input_shape)) return;
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    //[width, height, IFM]
    new_tensor(input(), input_shape, args.input_data);
    //[width, height, OFM]
    new_tensor(output(), output_shape, args.output_data);
    acl_configure(pooling, this, pool_info);
  }

  // Executes the configured layer on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param` and returns true when the ACL path must be
  // skipped: force-disabled, a pool type other than "max"/"avg", or a
  // non-square pooling window.
  bool Bypass_acl(const PoolParam& param) {
    bool bypass_acl = false;
    AclParametersByContext(param);
    if (this->force_bypass_acl_path_) {
      bypass_acl = true;
    }
    if (args.pool_type != "max" && args.pool_type != "avg") {
      bypass_acl = true;
    }
    if (args.filter_rows != args.filter_cols) {
      bypass_acl = true;
    }
    return bypass_acl;
  }

 private:
  // Copies data pointers, pooling configuration and input dims into `args`,
  // then derives the output extents.
  void AclParametersByContext(const PoolParam& param) {
    const Tensor* in_x = param.Input();
    Tensor* out = param.Output();
    std::string pooling_type = param.PoolingType();
    std::vector<int> ksize = param.Ksize();
    std::vector<int> strides = param.Strides();
    std::vector<int> paddings = param.Paddings();
    bool is_global_pooling = param.isGlobalPooling();
    const T* input_data = in_x->data<T>();
    T* output_data = out->mutable_data<T>();
    args.input_data = (void*)input_data;
    args.output_data = (void*)output_data;
    args.is_global_pool = is_global_pooling;
    args.pool_type = pooling_type;
    args.filter_rows = ksize[0];
    args.filter_cols = ksize[1];
    args.dim = ksize.size();
    // Input dims, read as NCHW
    args.batch = in_x->dims()[0];
    args.in_depth = in_x->dims()[1];
    args.in_rows = in_x->dims()[2];
    args.in_cols = in_x->dims()[3];
    // strides(h_stride, w_stride)
    args.stride_rows = strides[0];
    args.stride_cols = strides[1];
    // paddings(h_pad, w_pad)
    args.pad_rows = paddings[0];
    args.pad_cols = paddings[1];
    // Apply the global-pooling override BEFORE deriving the output extents,
    // so they are computed from the window and padding actually used.
    if (is_global_pooling) {
      args.filter_rows = args.in_rows;
      args.filter_cols = args.in_cols;
      args.pad_rows = 0;
      args.pad_cols = 0;
    }
    args.out_depth = args.in_depth;
    // ceil((in + 2 * pad - filter) / stride) + 1, matching the CEIL rounding
    // passed to PadStrideInfo in InitAclLayer().
    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
                                                             2 * args.pad_rows -
                                                             args.filter_rows) /
                                          args.stride_rows)) +
                    1;
    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
                                                             2 * args.pad_cols -
                                                             args.filter_cols) /
                                          args.stride_cols)) +
                    1;
  }

  acl::AclParameters args;
};
// Ensures an AclPoolOp wrapper is attached to this kernel: if GetAclOp()
// returns null, a new wrapper is allocated and registered through SetAclOp.
// Always returns true.
template <>
bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
  using PoolAclOp = AclPoolOp<GPU_MALI, float>;

  if (this->GetAclOp() == nullptr) {
    PoolAclOp* fresh_op = new PoolAclOp();
    this->SetAclOp((void*)fresh_op, (void*)this);
  }
  return true;
}
// Runs pooling through the attached AclPoolOp wrapper.
//
// Returns without computing anything when no wrapper is attached or when
// Bypass_acl() reports that the ACL path cannot be used. Otherwise the layer
// is (lazily) initialised and executed once per batch element, advancing the
// input/output pointers by one NCHW sample each time.
template <>
void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
  using PoolAclOp = AclPoolOp<GPU_MALI, float>;

  std::cout << "init acl" << std::endl;
  PoolAclOp* pool_op = reinterpret_cast<PoolAclOp*>(this->GetAclOp());
  if (pool_op == nullptr) {
    return;
  }
  // Bypass_acl also refreshes the wrapper's args from `param`.
  if (pool_op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }

  acl::AclParameters& args = pool_op->getargs();
  const float* src = (const float*)args.input_data;
  const float* dst = (const float*)args.output_data;
  pool_op->InitAclLayer(param);

  int sample = 0;
  while (sample < args.batch) {
    pool_op->RunAcl((void*)src, (void*)dst);
    // Step to the next sample: C*H*W elements on each side.
    src += args.in_depth * args.in_cols * args.in_rows;
    dst += args.in_depth * args.out_cols * args.out_rows;
    ++sample;
  }
}
template class PoolKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wrapper that drives a ReLU activation through acl::ACLOperator.
//
// Usage as seen in the kernel below: Bypass_acl() refreshes `args` from the
// op parameters, InitAclLayer() configures the layer on first use, and
// RunAcl() executes it on raw buffers. Copy and move are disabled.
template <typename DeviceType, typename T>
class AclReluOp : public acl::ACLOperator {
 public:
  AclReluOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
  }
  ~AclReluOp() = default;

  AclReluOp(const AclReluOp&) = delete;
  AclReluOp& operator=(const AclReluOp&) = delete;
  AclReluOp(AclReluOp&&) = delete;
  AclReluOp& operator=(AclReluOp&&) = delete;

  // Gives callers access to the cached layer parameters.
  acl::AclParameters& getargs() { return args; }

  // Configures the activation layer for the shapes currently held in `args`.
  // Input and output are described as flat 1-D shapes. Returns early when
  // is_operator_init_done() reports the operator is already set up for the
  // input shape.
  void InitAclLayer(const ReluParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);

    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
                                         args.in_depth * args.batch);
    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
                                          args.in_depth * args.out_num);

    arm_compute::ActivationLayerInfo act_info(
        arm_compute::ActivationLayerInfo::ActivationFunction::RELU);

    if (is_operator_init_done(input_shape)) {
      return;
    }
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;

    new_tensor(input(), input_shape, args.input_data);
    new_tensor(output(), output_shape, args.output_data);

    acl_configure(activation, this, act_info);
  }

  // Executes the configured layer once on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param`; returns true when the bypass flag
  // force_bypass_acl_path_ is set.
  bool Bypass_acl(const ReluParam& param) {
    AclParametersByContext(param);
    if (this->force_bypass_acl_path_) {
      return true;
    }
    return false;
  }

 private:
  // Captures data pointers and the four leading dimensions of the input
  // (stored as batch, depth, rows, cols) plus the first output dimension.
  void AclParametersByContext(const ReluParam& param) {
    const auto* src_tensor = param.InputX();
    auto* dst_tensor = param.Out();

    const T* src_ptr = src_tensor->data<T>();
    T* dst_ptr = dst_tensor->mutable_data<T>();
    args.input_data = (void*)src_ptr;
    args.output_data = (void*)dst_ptr;

    args.batch = src_tensor->dims()[0];
    args.in_depth = src_tensor->dims()[1];
    args.in_rows = src_tensor->dims()[2];
    args.in_cols = src_tensor->dims()[3];
    args.out_num = dst_tensor->dims()[0];
  }

  acl::AclParameters args;
};
// Ensures an AclReluOp wrapper is attached to this kernel: if GetAclOp()
// returns null, a new wrapper is allocated and registered through SetAclOp.
// Always returns true.
template <>
bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
  using ReluAclOp = AclReluOp<GPU_MALI, float>;

  if (this->GetAclOp() == nullptr) {
    ReluAclOp* fresh_op = new ReluAclOp();
    this->SetAclOp((void*)fresh_op, (void*)this);
  }
  return true;
}
// Runs ReLU through the attached AclReluOp wrapper.
//
// Returns without computing anything when no wrapper is attached or when
// Bypass_acl() reports that the ACL path cannot be used. Otherwise the layer
// is (lazily) initialised and executed once over the whole tensor.
template <>
void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
  using ReluAclOp = AclReluOp<GPU_MALI, float>;

  std::cout << "init acl" << std::endl;
  ReluAclOp* relu_op = reinterpret_cast<ReluAclOp*>(this->GetAclOp());
  if (relu_op == nullptr) {
    return;
  }
  // Bypass_acl also refreshes the wrapper's args from `param`.
  if (relu_op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }

  acl::AclParameters& args = relu_op->getargs();
  const float* src = (const float*)args.input_data;
  const float* dst = (const float*)args.output_data;

  relu_op->InitAclLayer(param);
  relu_op->RunAcl((void*)src, (void*)dst);
}
template class ReluKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
// Initialisation hook for the Mali GPU reshape kernel. No setup is performed
// here; the function unconditionally reports success.
template <>
bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
return true;
}
// Reshapes the input tensor into the output tensor.
//
// The target shape defaults to the output tensor's current dims. When the
// optional shape input is present, its int values are validated against the
// input dims via ValidateShape and used instead.
//
// In-place mode makes the output share the input's storage; otherwise the
// output is allocated and the data copied. In both modes the output is
// resized to the target shape again afterwards, as the original code did
// after ShareDataWith / TensorCopy.
template <>
void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
  const auto *input_x = param.InputX();
  auto *out = param.Out();
  framework::DDim out_dims = out->dims();

  const auto *input_shape = param.InputShape();
  if (input_shape) {
    auto *shape_data = input_shape->data<int>();
    auto shape =
        std::vector<int>(shape_data, shape_data + input_shape->numel());
    out_dims = ValidateShape(shape, input_x->dims());
  }

  bool inplace = param.Inplace();
  out->Resize(out_dims);
  if (!inplace) {
    out->mutable_data<float>();
    framework::TensorCopy(*input_x, out);
  } else {
    out->ShareDataWith(*input_x);
  }
  // Re-apply the target shape after the copy/share step.
  out->Resize(out_dims);
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Wrapper that drives a softmax layer through acl::ACLOperator.
//
// Usage as seen in the kernel below: Bypass_acl() refreshes `args` from the
// op parameters, InitAclLayer() configures the layer on first use, and
// RunAcl() executes it on raw buffers. Copy and move are disabled.
template <typename DeviceType, typename T>
class AclSoftmaxOp : public acl::ACLOperator {
 public:
  AclSoftmaxOp() {
    this->force_bypass_acl_path_ =
        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
  }
  ~AclSoftmaxOp() = default;
  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;

  // Gives callers access to the cached layer parameters.
  acl::AclParameters& getargs() { return args; }

  // Configures the softmax layer with a (in_depth, batch) tensor shape for
  // both input and output. Returns early when is_operator_init_done()
  // reports the operator is already set up for that shape.
  void InitAclLayer(const SoftmaxParam& param) {
    setTargetHint(acl::TargetHint::OPENCL);
    arm_compute::TensorShape shape(args.in_depth, args.batch);
    if (is_operator_init_done(shape)) return;
    set_operator_init_done();
    this->force_bypass_acl_path_ = false;
    new_tensor(input(), shape, args.input_data);
    new_tensor(output(), shape, args.output_data);
    acl_configure(softmax, this, NULL);
  }

  // Executes the configured layer once on the given raw buffers.
  void RunAcl(void* input, void* output) {
    acl::ACLOperator::acl_run(input, output);
  }

  // Refreshes `args` from `param`; returns true when the bypass flag
  // force_bypass_acl_path_ is set.
  bool Bypass_acl(const SoftmaxParam& param) {
    bool bypass_acl = false;
    AclParametersByContext(param);
    if (this->force_bypass_acl_path_) {
      bypass_acl = true;
    }
    return bypass_acl;
  }

 private:
  // Resizes the output to the input's dims and captures data pointers plus
  // the first two input dimensions (stored as batch and in_depth) and the
  // first output dimension.
  void AclParametersByContext(const SoftmaxParam& param) {
    const framework::Tensor* in_x = param.InputX();
    framework::Tensor* out = param.Out();
    auto x_dims = in_x->dims();
    out->Resize(x_dims);

    const T* input_data = in_x->data<T>();
    // The output was just resized, so request a writable buffer with
    // mutable_data (as the pooling and relu wrappers do) rather than data().
    T* output_data = out->mutable_data<T>();

    args.input_data = (void*)input_data;
    args.output_data = (void*)output_data;

    // NCHW
    args.batch = in_x->dims()[0];
    args.in_depth = in_x->dims()[1];
    args.out_num = out->dims()[0];
  }

  acl::AclParameters args;
};
// Ensures an AclSoftmaxOp wrapper is attached to this kernel: if GetAclOp()
// returns null, a new wrapper is allocated and registered through SetAclOp.
// Always returns true.
template <>
bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
  using SoftmaxAclOp = AclSoftmaxOp<GPU_MALI, float>;

  if (this->GetAclOp() == nullptr) {
    SoftmaxAclOp* fresh_op = new SoftmaxAclOp();
    this->SetAclOp((void*)fresh_op, (void*)this);
  }
  return true;
}
// Runs softmax through the attached AclSoftmaxOp wrapper.
//
// Returns without computing anything when no wrapper is attached or when
// Bypass_acl() reports that the ACL path cannot be used. Otherwise the layer
// is (lazily) initialised and executed args.out_num times, advancing both
// pointers by args.in_depth elements after each run.
template <>
void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
  using SoftmaxAclOp = AclSoftmaxOp<GPU_MALI, float>;

  std::cout << "init acl" << std::endl;
  SoftmaxAclOp* softmax_op =
      reinterpret_cast<SoftmaxAclOp*>(this->GetAclOp());
  if (softmax_op == nullptr) {
    return;
  }
  // Bypass_acl also refreshes the wrapper's args from `param`.
  if (softmax_op->Bypass_acl(param)) {
    std::cout << "init acl failed" << std::endl;
    return;
  }

  acl::AclParameters& args = softmax_op->getargs();
  const float* src = (const float*)args.input_data;
  const float* dst = (const float*)args.output_data;
  softmax_op->InitAclLayer(param);

  int row = 0;
  while (row < args.out_num) {
    softmax_op->RunAcl((void*)src, (void*)dst);
    src += args.in_depth;
    dst += args.in_depth;
    ++row;
  }
}
template class SoftmaxKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
...@@ -34,6 +34,8 @@ USE_OP_CPU(lrn); ...@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwiseconv3x3s1p1.h"
#include <arm_neon.h>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
Tensor bias, bool if_bias) {
const float *input_data = input->data<float>();
const float *filter_data = filter.data<float>();
float *output_data = output->data<float>();
const float *bias_data = bias.data<float>();
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
const int l = h;
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
const float *filter_data_tmp = filter_data;
for (int j = 0; j < c; ++j) {
if (if_bias) {
vbias = vdupq_n_f32(bias_data[j]);
}
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
float w02 = filter_data_tmp[2];
float w10 = filter_data_tmp[3];
float w11 = filter_data_tmp[4];
float w12 = filter_data_tmp[5];
float w20 = filter_data_tmp[6];
float w21 = filter_data_tmp[7];
float w22 = filter_data_tmp[8];
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[l] + w22 * input_data[l + 1] +
bias_data[j];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
w20 * input_data[2 * l - 2] +
w21 * input_data[2 * l - 1] + bias_data[j];
output_data[(l - 1) * l] =
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
bias_data[j];
output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1] + bias_data[j];
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] =
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
bias_data[j];
output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
w01 * input_data[i * l + l - 1 - l] +
w10 * input_data[i * l + l - 1 - 1] +
w11 * input_data[i * l + l - 1] +
w20 * input_data[i * l + l - 1 + l - 1] +
w21 * input_data[i * l + l - 1 + l] +
bias_data[j];
}
// top 1 row and bottom 1 row
const float *input_tmp = input_data;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, out0;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + l);
const float *input_tmp_end = input_tmp + (l - 2) * l;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + l);
int c_mid = l_mid;
auto output_ptr = output_data + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + l + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + l + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
tmp2 = vextq_f32(in6, in7, 1);
tmp3 = vextq_f32(in6, in7, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr + (l - 1) * l, out0);
// can optimize to each 8 stride.
input_tmp += 4;
input_tmp_end += 4;
output_ptr += 4;
in0 = in1;
in2 = in3;
in4 = in5;
in6 = in7;
}
// top right pad
float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
tmp2 = vextq_f32(in2, pad1, 1);
tmp3 = vextq_f32(in2, pad1, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
// bottom right pad
float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
tmp2 = vextq_f32(in6, pad3, 1);
tmp3 = vextq_f32(in6, pad3, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
}
}
// mid
for (int i = 0; i < l - 2; ++i) {
auto output_ptr = output_data + (i + 1) * l + 1;
input_tmp = input_data + i * l;
auto in0_tmp = vld1q_f32(input_tmp);
auto in2_tmp = vld1q_f32(input_tmp + l);
auto in4_tmp = vld1q_f32(input_tmp + l + l);
c_mid = l_mid;
for (; c_mid > 3; c_mid -= 4) {
auto in1_tmp = vld1q_f32(input_tmp + 4);
auto in3_tmp = vld1q_f32(input_tmp + l + 4);
auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
tmp2 = vextq_f32(in2_tmp, in3_tmp, 1);
tmp3 = vextq_f32(in2_tmp, in3_tmp, 2);
tmp4 = vextq_f32(in4_tmp, in5_tmp, 1);
tmp5 = vextq_f32(in4_tmp, in5_tmp, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
output_ptr += 4;
input_tmp += 4;
in0_tmp = in1_tmp;
in2_tmp = in3_tmp;
in4_tmp = in5_tmp;
}
float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
tmp0 = vextq_f32(in0_tmp, pad0, 1);
tmp1 = vextq_f32(in0_tmp, pad0, 2);
tmp2 = vextq_f32(in2_tmp, pad1, 1);
tmp3 = vextq_f32(in2_tmp, pad1, 2);
tmp4 = vextq_f32(in4_tmp, pad2, 1);
tmp5 = vextq_f32(in4_tmp, pad2, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
}
output_data += hxw;
input_data += hxw;
filter_data_tmp += 9;
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
// Depthwise 3x3 convolution (stride 1 / padding 1, per the function name)
// over a float tensor.
//   input   - source tensor
//   filter  - 3x3 weights, passed by value
//   output  - destination tensor, written in place
//   bias    - bias tensor, passed by value
//   if_bias - flag telling the implementation whether `bias` is to be used
void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
Tensor bias, bool if_bias);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
...@@ -60,6 +60,8 @@ USE_OP_CPU(mul); ...@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU(mul, ops::MulOp); REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d); ...@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -38,6 +38,8 @@ USE_OP_CPU(relu); ...@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp); REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -37,6 +37,8 @@ USE_OP_CPU(reshape); ...@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -32,6 +32,8 @@ USE_OP_CPU(softmax); ...@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp); REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
#!/usr/bin/env sh #!/usr/bin/env sh
push_fn () { push_fn () {
MODELS_PATH="../test/models/*" MODELS_PATH="../../test/models/*"
MODELS_SRC="../test/models" MODELS_SRC="../../test/models"
IMAGE_PATH="../test/images/*" IMAGE_PATH="../../test/images/*"
EXE_FILE="../test/build/*" EXE_FILE="../../test/build/*"
EXE_DIR="data/local/tmp/bin" EXE_DIR="data/local/tmp/bin"
adb shell mkdir ${EXE_DIR} adb shell mkdir ${EXE_DIR}
MODELS_DIR="data/local/tmp/models" MODELS_DIR="data/local/tmp/models"
...@@ -14,9 +14,14 @@ do ...@@ -14,9 +14,14 @@ do
adb shell mkdir ${MODELS_DIR}"/"${file} adb shell mkdir ${MODELS_DIR}"/"${file}
done done
if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then
ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*"
adb push ${ACL_BUILD_PATH} ${EXE_DIR}
fi
IMAGES_DIR="data/local/tmp/images" IMAGES_DIR="data/local/tmp/images"
adb shell mkdir ${IMAGES_DIR} adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../build/release/arm-v7a/build/*" LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR} adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR} adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then if [[ $1 != "npm" ]]; then
......
...@@ -56,7 +56,7 @@ build_for_android() { ...@@ -56,7 +56,7 @@ build_for_android() {
MODE="Release" MODE="Release"
ANDROID_PLATFORM_VERSION="android-15" ANDROID_PLATFORM_VERSION="android-22"
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm" ANDROID_ARM_MODE="arm"
if [ $# -eq 1 ]; then if [ $# -eq 1 ]; then
......
#!/usr/bin/env sh
# auto build and run
BUILDNET="mobilenetssd"
TESTUNIT="test-mobilenetssd"
# Builds the ${BUILDNET} target for Android, pushes the test binaries and
# libraries to the device under data/local/tmp, and runs ${TESTUNIT} there.
#
# $1 (optional): when it is "npm", the images and models are NOT pushed;
#                for any other value (or none) they are pushed as well.
#
# Uses POSIX `[ ... ]` tests because the script runs under `sh`, where the
# bash-only `[[ ... ]]` is not guaranteed to exist.
push_fn () {
    sh build.sh android ${BUILDNET}

    MODELS_PATH="../test/models/*"
    MODELS_SRC="../test/models"
    IMAGE_PATH="../test/images/*"
    EXE_FILE="../test/build/*"
    EXE_DIR="data/local/tmp/bin"
    adb shell mkdir ${EXE_DIR}

    MODELS_DIR="data/local/tmp/models"
    adb shell mkdir ${MODELS_DIR}
    # Mirror each entry of the local models directory on the device.
    for file in `ls ${MODELS_SRC}`
    do
        adb shell mkdir ${MODELS_DIR}"/"${file}
    done

    IMAGES_DIR="data/local/tmp/images"
    adb shell mkdir ${IMAGES_DIR}

    LIB_PATH="../build/release/arm-v7a/build/*"
    # The *_PATH / EXE_FILE values are left unquoted on purpose so the shell
    # expands the trailing `*` glob.
    adb push ${EXE_FILE} ${EXE_DIR}
    adb push ${LIB_PATH} ${EXE_DIR}
    if [ "$1" != "npm" ]; then
        adb push ${IMAGE_PATH} ${IMAGES_DIR}
        adb push ${MODELS_PATH} ${MODELS_DIR}
    fi
    adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}"
}
# Entry point: forward the "npm" flag to push_fn when it was given as the
# first argument, otherwise call push_fn without arguments.
# POSIX `[ ... ]` with `=` is used because the script runs under `sh`.
if [ "$1" = "npm" ]; then
    push_fn "$1"
else
    push_fn
fi
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册