Commit a867dbbf authored by chonwhite

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into fpga_pr

...@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ...@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib $ENV{CUDNN_ROOT}/lib
/usr/lib /usr/lib
${CUDA_TOOLKIT_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-${CUDA_TOOLKIT_ROOT_DIR}/lib64
-)
+${CUDA_TOOLKIT_ROOT_DIR}/lib64)
if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH) find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
......
...@@ -46,7 +46,6 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -46,7 +46,6 @@ void OutputOptModel(const std::string& load_model_dir,
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
std::vector<Place> vaild_places = { std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
}; };
if (FLAGS_is_quantized_model) { if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(), vaild_places.insert(vaild_places.begin(),
......
...@@ -47,7 +47,6 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -47,7 +47,6 @@ void OutputOptModel(const std::string& load_model_dir,
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
......
...@@ -153,7 +153,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -153,7 +153,7 @@ class LITE_API CxxConfig : public ConfigBase {
std::string param_file() const { return param_file_; } std::string param_file() const { return param_file_; }
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
-void set_cpu_math_library_math_threads(int threads) {
+void set_cpu_math_library_num_threads(int threads) {
cpu_math_library_math_threads_ = threads; cpu_math_library_math_threads_ = threads;
} }
int cpu_math_library_num_threads() const { int cpu_math_library_num_threads() const {
......
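For reference, a minimal usage sketch of the renamed setter and the ARM-only valid places from the hunks above. The include path and the helper function name are illustrative assumptions; only `CxxConfig`, `set_cpu_math_library_num_threads`, `set_valid_places`, `Place`, and `CreatePaddlePredictor` come from this diff.

```cpp
// Sketch only: shows the API surface touched by this commit.
#include <string>
#include "lite/api/paddle_api.h"  // assumed header location for CxxConfig/Place

void build_predictor(const std::string& model_dir) {  // hypothetical helper
  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  // Renamed in this commit: set_cpu_math_library_math_threads ->
  // set_cpu_math_library_num_threads.
  config.set_cpu_math_library_num_threads(1);
  // The model/benchmark tools now list ARM only (the kX86 place is removed).
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
}
```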
...@@ -31,9 +31,11 @@ USE_MIR_PASS(lite_fc_fuse_pass); ...@@ -31,9 +31,11 @@ USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_precision_cast_pass);
......
...@@ -30,7 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -30,7 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::string model_dir = FLAGS_model_dir; std::string model_dir = FLAGS_model_dir;
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(model_dir); config.set_model_dir(model_dir);
-config.set_cpu_math_library_math_threads(10);
+config.set_cpu_math_library_num_threads(1);
config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
......
...@@ -25,7 +25,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -25,7 +25,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -40,7 +39,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -40,7 +39,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -55,7 +53,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout, ...@@ -55,7 +53,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -70,7 +67,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, ...@@ -70,7 +67,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -93,7 +89,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -93,7 +89,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
const float *bias, const float *bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu,
const operators::ActivationParam act_param, const operators::ActivationParam act_param,
ARMContext *ctx) { ARMContext *ctx) {
if (pad == 0) { if (pad == 0) {
...@@ -103,7 +98,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -103,7 +98,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -118,7 +112,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -118,7 +112,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -136,7 +129,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -136,7 +129,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -151,7 +143,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -151,7 +143,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -163,7 +154,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -163,7 +154,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
} }
} }
} }
// clang-format on
#ifdef __aarch64__ #ifdef __aarch64__
#define INIT_S1 \ #define INIT_S1 \
"PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \
...@@ -2318,7 +2309,6 @@ void act_switch_3x3s1p1(const float *din_ptr0, ...@@ -2318,7 +2309,6 @@ void act_switch_3x3s1p1(const float *din_ptr0,
} }
} }
#endif #endif
// clang-format on
/** /**
* \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
* width > 4 * width > 4
...@@ -2328,7 +2318,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout, ...@@ -2328,7 +2318,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -2857,7 +2846,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, ...@@ -2857,7 +2846,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -3443,7 +3431,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3443,7 +3431,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -3579,129 +3566,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3579,129 +3566,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
} }
int cnt = tile_w; int cnt = tile_w;
/*
if (flag_relu) {
asm volatile(
INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0)
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0)
"ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234
"ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345
"ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0)
"ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0)
MID_COMPUTE_S1 MID_RESULT_S1_RELU
"cmp %w[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1_RELU "0: \n"
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
} else {
asm volatile(
INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0)
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0)
"ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234
"ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345
"ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0)
"ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0)
MID_COMPUTE_S1 MID_RESULT_S1
"cmp %w[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
}
*/
act_switch_3x3s1p0(din_ptr0, act_switch_3x3s1p0(din_ptr0,
din_ptr1, din_ptr1,
din_ptr2, din_ptr2,
...@@ -3760,90 +3624,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3760,90 +3624,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
int cnt = tile_w; int cnt = tile_w;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
/*
if (flag_relu) {
asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
MID_RESULT_S1_RELU
"cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1_RELU "0: \n"
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} else {
asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
MID_RESULT_S1
"cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}*/
act_switch_3x3s1p0(din_ptr0, act_switch_3x3s1p0(din_ptr0,
din_ptr1, din_ptr1,
din_ptr2, din_ptr2,
...@@ -4174,7 +3954,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4174,7 +3954,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -4213,14 +3992,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4213,14 +3992,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
float32x4_t wr1 = vld1q_f32(weight_ptr + 3); float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
float32x4_t wr2 = vld1q_f32(weight_ptr + 6); float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
// #ifdef __aarch64__
// float32x4_t wbias;
// if (flag_bias) {
// wbias = vdupq_n_f32(bias[i]);
// } else {
// wbias = vdupq_n_f32(0.f);
// }
// #endif // __aarch64__
float32x4_t wbias; float32x4_t wbias;
float bias_val = 0.f; float bias_val = 0.f;
if (flag_bias) { if (flag_bias) {
...@@ -4261,137 +4032,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4261,137 +4032,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
break; break;
} }
} }
/*
#ifdef __aarch64__
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vbias] "w"(wbias),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[vzero] "w"(vzero),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vbias] "w"(wbias),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[vzero] "w"(vzero),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
#else
unsigned int *vmask_ptr = vmask;
float bias_val = flag_bias ? bias[i] : 0.f;
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[bias_val] "r"(bias_val),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[bias_val] "r"(bias_val),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif
*/
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
act_switch_3x3s1p0_s(dr0, act_switch_3x3s1p0_s(dr0,
dr1, dr1,
......
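The changes in this file drop the `flag_relu` argument and thread an `operators::ActivationParam` through the depthwise kernels instead, so relu, relu6, and leaky relu can all be fused. Below is a scalar sketch of the per-element activation the NEON macros implement, using the coefficient fields read in this diff (`Relu_clipped_coef`, `Leaky_relu_alpha`); it is a reference model, not the shipped asm.

```cpp
#include <algorithm>

// Scalar reference for the fused activation applied to each accumulator lane;
// the kernels do the same thing with NEON (fmax/fmin and cmhs+fmul+bif).
enum class ActType { kNone, kRelu, kRelu6, kLeakyRelu };

inline float apply_fused_act(float x, ActType type, float six, float alpha) {
  switch (type) {
    case ActType::kRelu:       // out = max(x, 0)
      return std::max(x, 0.f);
    case ActType::kRelu6:      // out = min(max(x, 0), six)
      return std::min(std::max(x, 0.f), six);
    case ActType::kLeakyRelu:  // out = x >= 0 ? x : x * alpha
      return x >= 0.f ? x : x * alpha;
    default:                   // no activation fused
      return x;
  }
}
```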
...@@ -836,7 +836,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, ...@@ -836,7 +836,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size); ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
/// get workspace /// get workspace
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <arm_neon.h> #include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_depthwise.h" #include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle { namespace paddle {
...@@ -24,13 +25,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -24,13 +25,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_s(float* dout, void conv_depthwise_3x3s2p0_bias_s(float* dout,
...@@ -38,13 +39,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -38,13 +39,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias(float* dout, void conv_depthwise_3x3s2p1_bias(float* dout,
...@@ -52,13 +53,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -52,13 +53,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_s(float* dout, void conv_depthwise_3x3s2p1_bias_s(float* dout,
...@@ -66,13 +67,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -66,13 +67,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2_fp32(const float* din, void conv_depthwise_3x3s2_fp32(const float* din,
...@@ -88,7 +89,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -88,7 +89,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
-bool flag_relu,
+const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
if (pad == 0) { if (pad == 0) {
if (w_in > 7) { if (w_in > 7) {
...@@ -97,13 +98,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -97,13 +98,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} else { } else {
conv_depthwise_3x3s2p0_bias_s(dout, conv_depthwise_3x3s2p0_bias_s(dout,
...@@ -111,13 +112,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -111,13 +112,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} }
} }
...@@ -128,13 +129,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -128,13 +129,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} else { } else {
conv_depthwise_3x3s2p1_bias_s(dout, conv_depthwise_3x3s2p1_bias_s(dout,
...@@ -142,13 +143,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -142,13 +143,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} }
} }
...@@ -412,6 +413,83 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -412,6 +413,83 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \ "and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\ \
"blt 1f \n" "blt 1f \n"
#define LEFT_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" \
"ld1 {v22.4s}, [%[six_ptr]] \n" \
\
"ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \
"ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \
"ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \
"ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \
"ld1 {v15.4s}, [%[inptr0]] \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
\
"ld1 {v18.4s}, [%[inptr1]] \n" \
"ld1 {v19.4s}, [%[inptr2]] \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"fmax v17.4s, v17.4s, %[vzero].4s \n" \
\
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"fmin v17.4s, v17.4s, v22.4s \n" \
\
"cmp %w[cnt], #1 \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"blt 1f \n"
#define LEFT_RESULT_S2_LEAKY_RELU \
"ld1 {v22.4s}, [%[scale_ptr]] \n" \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
\
"ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \
"ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \
"ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \
\
"fmul v12.4s, v16.4s, v22.4s \n" \
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \
"ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \
"ld1 {v15.4s}, [%[inptr0]] \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"ld1 {v18.4s}, [%[inptr1]] \n" \
"ld1 {v19.4s}, [%[inptr2]] \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"cmp %w[cnt], #1 \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"blt 1f \n"
#define MID_RESULT_S2_RELU \ #define MID_RESULT_S2_RELU \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
...@@ -438,6 +516,58 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -438,6 +516,58 @@ void conv_depthwise_3x3s2_fp32(const float* din,
\ \
"bne 2b \n" "bne 2b \n"
#define MID_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld1 {v19.4s}, [%[inptr2]] \n" \
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"subs %w[cnt], %w[cnt], #1 \n" \
\
"fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"fmin v17.4s, v17.4s, v22.4s \n" \
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
\
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"bne 2b \n"
#define MID_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld1 {v19.4s}, [%[inptr2]] \n" \
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"subs %w[cnt], %w[cnt], #1 \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
\
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"bne 2b \n"
#define RIGHT_RESULT_S2_RELU \ #define RIGHT_RESULT_S2_RELU \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\ \
...@@ -456,6 +586,47 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -456,6 +586,47 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"st1 {v17.4s}, [%[outptr1]], #16 \n" \ "st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n" "4: \n"
#define RIGHT_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"bif v16.16b, v0.16b, %[wmask].16b \n" \
\
"fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"fmin v17.4s, v17.4s, v22.4s \n" \
"bif v17.16b, v1.16b, %[wmask].16b \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n"
#define RIGHT_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"bif v16.16b, v0.16b, %[wmask].16b \n" \
\
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
"bif v17.16b, v1.16b, %[wmask].16b \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n"
#define COMPUTE_S_S2 \ #define COMPUTE_S_S2 \
"movi v9.4s, #0 \n" \ "movi v9.4s, #0 \n" \
"ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \
...@@ -500,7 +671,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -500,7 +671,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"fmax v4.4s, v4.4s, v9.4s \n" \ "fmax v4.4s, v4.4s, v9.4s \n" \
\ \
"st1 {v4.4s}, [%[out]] \n" "st1 {v4.4s}, [%[out]] \n"
#define COMPUTE_S_S2_P0 \ #define COMPUTE_S_S2_P0 \
"movi v9.4s, #0 \n" \ "movi v9.4s, #0 \n" \
"ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \
...@@ -537,7 +707,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -537,7 +707,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"fadd v4.4s, v4.4s, v16.4s \n" "fadd v4.4s, v4.4s, v16.4s \n"
#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" #define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n"
#define RESULT_S_S2_P0_RELU \ #define RESULT_S_S2_P0_RELU \
"fmax v4.4s, v4.4s, v9.4s \n" \ "fmax v4.4s, v4.4s, v9.4s \n" \
"st1 {v4.4s}, [%[out]] \n" "st1 {v4.4s}, [%[out]] \n"
...@@ -682,7 +851,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -682,7 +851,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vst1.32 {d6-d7}, [%[outptr]]! \n" \ "vst1.32 {d6-d7}, [%[outptr]]! \n" \
"cmp %[cnt], #1 \n" \ "cmp %[cnt], #1 \n" \
"blt 1f \n" "blt 1f \n"
#define MID_RESULT_S2_RELU \ #define MID_RESULT_S2_RELU \
"vmax.f32 q3, q3, q9 @ relu \n" \ "vmax.f32 q3, q3, q9 @ relu \n" \
"subs %[cnt], #1 \n" \ "subs %[cnt], #1 \n" \
...@@ -739,7 +907,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -739,7 +907,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vadd.f32 q3, q3, q5 @ add \n" "vadd.f32 q3, q3, q5 @ add \n"
#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" #define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n"
#define RESULT_S_S2_RELU \ #define RESULT_S_S2_RELU \
"vmax.f32 q3, q3, q9 @ relu\n" \ "vmax.f32 q3, q3, q9 @ relu\n" \
\ \
...@@ -787,13 +954,233 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -787,13 +954,233 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vadd.f32 q3, q3, q5 @ add \n" "vadd.f32 q3, q3, q5 @ add \n"
#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" #define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n"
#define RESULT_S_S2_P0_RELU \ #define RESULT_S_S2_P0_RELU \
"vmax.f32 q3, q3, q9 @ relu \n" \ "vmax.f32 q3, q3, q9 @ relu \n" \
"vst1.32 {d6-d7}, [%[out]] \n" "vst1.32 {d6-d7}, [%[out]] \n"
#endif #endif
#ifdef __aarch64__
void act_switch_3x3s2p1(const float* din0_ptr,
const float* din1_ptr,
const float* din2_ptr,
const float* din3_ptr,
const float* din4_ptr,
float* doutr0_ptr,
float* doutr1_ptr,
float32x4_t wr0,
float32x4_t wr1,
float32x4_t wr2,
uint32x4_t vmask_rp1,
uint32x4_t vmask_rp2,
uint32x4_t wmask,
float32x4_t wbias,
float32x4_t vzero,
int cnt,
int cnt_remain,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
break;
case lite_api::ActivationType::kRelu6:
/* 0 <= din <= 6 */
asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2
MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vsix),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
case lite_api::ActivationType::kLeakyRelu:
/*din = din >= 0 ? din : din * scale*/
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[scale_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
}
#endif
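For orientation, the new `act_switch_3x3s2p1` / `act_switch_3x3s2p0` helpers pick one of four asm bodies based on `ActivationParam`. A hedged sketch of how a caller might fill the fields these helpers read (`has_active`, `active_type`, `Relu_clipped_coef`); the include path is an assumption, and the real construction happens in the conv kernel setup, which is not part of this diff.

```cpp
#include "lite/operators/op_params.h"  // assumed header for ActivationParam

// Illustrative only: builds a relu6 ActivationParam as consumed by the
// act_switch_* dispatchers above.
paddle::lite::operators::ActivationParam make_relu6_param(float six) {
  paddle::lite::operators::ActivationParam act_param;
  act_param.has_active = true;
  act_param.active_type = paddle::lite_api::ActivationType::kRelu6;
  act_param.Relu_clipped_coef = six;  // upper clamp used by the RELU6 macros
  return act_param;
}
```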
/** /**
* \brief depthwise convolution kernel 3x3, stride 2 * \brief depthwise convolution kernel 3x3, stride 2
* w_in > 7 * w_in > 7
...@@ -803,13 +1190,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -803,13 +1190,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -821,7 +1208,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -821,7 +1208,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
cnt_col++; cnt_col++;
size_right_remain -= 8; size_right_remain -= 8;
} }
int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
int size_right_pad = w_out * 2 - w_in; int size_right_pad = w_out * 2 - w_in;
...@@ -935,96 +1322,24 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -935,96 +1322,24 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
doutr1_ptr = write_ptr; doutr1_ptr = write_ptr;
} }
int cnt = cnt_col; int cnt = cnt_col;
if (flag_relu) { act_switch_3x3s2p1(din0_ptr,
asm volatile( din1_ptr,
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 din2_ptr,
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU din3_ptr,
: [inptr0] "+r"(din0_ptr), din4_ptr,
[inptr1] "+r"(din1_ptr), doutr0_ptr,
[inptr2] "+r"(din2_ptr), doutr1_ptr,
[inptr3] "+r"(din3_ptr), wr0,
[inptr4] "+r"(din4_ptr), wr1,
[outptr0] "+r"(doutr0_ptr), wr2,
[outptr1] "+r"(doutr1_ptr), vmask_rp1,
[cnt] "+r"(cnt) vmask_rp2,
: [vzero] "w"(vzero), wmask,
[w0] "w"(wr0), wbias,
[w1] "w"(wr1), vzero,
[w2] "w"(wr2), cnt,
[remain] "r"(cnt_remain), cnt_remain,
[mask1] "w"(vmask_rp1), act_param);
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
} else {
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1061,65 +1376,37 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -1061,65 +1376,37 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
} }
int cnt = cnt_col; int cnt = cnt_col;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
if (flag_relu) { asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
asm volatile( MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 : [din0_ptr] "+r"(din0_ptr),
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU [din1_ptr] "+r"(din1_ptr),
: [din0_ptr] "+r"(din0_ptr), [din2_ptr] "+r"(din2_ptr),
[din1_ptr] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr),
[din2_ptr] "+r"(din2_ptr), [cnt] "+r"(cnt),
[outptr] "+r"(doutr0_ptr), [mask_ptr] "+r"(mask_ptr)
[cnt] "+r"(cnt), : [remain] "r"(cnt_remain),
[mask_ptr] "+r"(mask_ptr) [wr0] "w"(wr0),
: [remain] "r"(cnt_remain), [wr1] "w"(wr1),
[wr0] "w"(wr0), [wr2] "w"(wr2),
[wr1] "w"(wr1), [bias] "r"(bias_c)
[wr2] "w"(wr2), : "cc",
[bias] "r"(bias_c) "memory",
: "cc", "q3",
"memory", "q4",
"q3", "q5",
"q4", "q6",
"q5", "q7",
"q6", "q8",
"q7", "q9",
"q8", "q10",
"q9", "q11",
"q10", "q12",
"q11", "q13",
"q12", "q14",
"q13", "q15");
"q14", // do act
"q15"); if (act_param.has_active) {
} else { act_switch_process(doutr0, doutr0, w_out, &act_param);
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[outptr] "+r"(doutr0_ptr),
[cnt] "+r"(cnt),
[mask_ptr] "+r"(mask_ptr)
: [remain] "r"(cnt_remain),
[wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} }
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
...@@ -1136,13 +1423,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1136,13 +1423,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1198,108 +1485,59 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1198,108 +1485,59 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) { asm volatile(COMPUTE_S_S2 RESULT_S_S2
asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "w"(vbias),
[bias] "w"(vbias), [out] "r"(out_buf)
[out] "r"(out_buf) : "v4",
: "v4", "v5",
"v5", "v6",
"v6", "v7",
"v7", "v8",
"v8", "v9",
"v9", "v10",
"v10", "v11",
"v11", "v12",
"v12", "v13",
"v13", "v14",
"v14", "v15");
"v15");
} else {
asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "w"(vbias),
[out] "r"(out_buf)
: "v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
#else #else
if (flag_relu) { asm volatile(COMPUTE_S_S2 RESULT_S_S2
asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c),
[bias] "r"(bias_c), [out] "r"(out_buf)
[out] "r"(out_buf) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15");
} else {
asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c),
[out] "r"(out_buf)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
// do act
if (act_param.has_active) {
act_switch_process(out_buf, out_buf, w_out, &act_param);
}
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
} }
...@@ -1310,6 +1548,269 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1310,6 +1548,269 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
} }
} }
#ifdef __aarch64__
void act_switch_3x3s2p0(const float* din0_ptr,
const float* din1_ptr,
const float* din2_ptr,
const float* din3_ptr,
const float* din4_ptr,
float* doutr0_ptr,
float* doutr1_ptr,
float32x4_t wr0,
float32x4_t wr1,
float32x4_t wr2,
uint32x4_t vmask_rp1,
uint32x4_t vmask_rp2,
uint32x4_t wmask,
float32x4_t wbias,
float32x4_t vzero,
int cnt,
int cnt_remain,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
break;
case lite_api::ActivationType::kRelu6:
/* 0 <= din <= 6 */
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU6
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU6
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vsix),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
case lite_api::ActivationType::kLeakyRelu:
/*din = din >= 0 ? din : din * scale*/
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_LEAKY_RELU
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2 "4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
}
#endif
/** /**
* \brief depthwise convolution kernel 3x3, stride 2 * \brief depthwise convolution kernel 3x3, stride 2
*/ */
...@@ -1319,13 +1820,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1319,13 +1820,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1438,117 +1939,24 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1438,117 +1939,24 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
doutr1_ptr = write_ptr; doutr1_ptr = write_ptr;
} }
int cnt = tile_w; int cnt = tile_w;
if (flag_relu) { act_switch_3x3s2p0(din0_ptr,
asm volatile( din1_ptr,
INIT_S2 din2_ptr,
"ld1 {v15.4s}, [%[inptr0]] \n" din3_ptr,
"ld1 {v18.4s}, [%[inptr1]] \n" din4_ptr,
"ld1 {v19.4s}, [%[inptr2]] \n" doutr0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n" doutr1_ptr,
"ld1 {v21.4s}, [%[inptr4]] \n" wr0,
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} wr1,
MID_COMPUTE_S2 MID_RESULT_S2_RELU wr2,
"cmp %w[remain], #1 \n" vmask_rp1,
"blt 4f \n" RIGHT_COMPUTE_S2 vmask_rp2,
RIGHT_RESULT_S2_RELU wmask,
"4: \n" wbias,
: [inptr0] "+r"(din0_ptr), vzero,
[inptr1] "+r"(din1_ptr), cnt,
[inptr2] "+r"(din2_ptr), cnt_remain,
[inptr3] "+r"(din3_ptr), act_param);
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
} else {
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1576,64 +1984,36 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1576,64 +1984,36 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
if (flag_relu) { asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_RESULT_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr),
[outptr] "+r"(doutr0_ptr), [cnt] "+r"(cnt),
[cnt] "+r"(cnt), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [remain] "r"(cnt_remain),
: [remain] "r"(cnt_remain), [wr0] "w"(wr0),
[wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c)
[bias] "r"(bias_c) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15"); if (act_param.has_active) {
} else { act_switch_process(doutr0, doutr0, w_out, &act_param);
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[outptr] "+r"(doutr0_ptr),
[cnt] "+r"(cnt),
[mask_ptr] "+r"(mask_ptr)
: [remain] "r"(cnt_remain),
[wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} }
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
...@@ -1650,13 +2030,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -1650,13 +2030,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1718,114 +2098,62 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -1718,114 +2098,62 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) { asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "w"(vbias),
[bias] "w"(vbias), [out] "r"(out_buf)
[out] "r"(out_buf) : "cc",
: "cc", "memory",
"memory", "v4",
"v4", "v5",
"v5", "v6",
"v6", "v7",
"v7", "v8",
"v8", "v9",
"v9", "v10",
"v10", "v11",
"v11", "v12",
"v12", "v13",
"v13", "v14",
"v14", "v15",
"v15", "v16");
"v16");
} else {
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "w"(vbias),
[out] "r"(out_buf)
: "cc",
"memory",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16");
}
#else #else
if (flag_relu) { asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr)
[din2_ptr] "+r"(din2_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c),
[bias] "r"(bias_c), [out] "r"(out_buf),
[out] "r"(out_buf), [mask_ptr] "r"(dmask)
[mask_ptr] "r"(dmask) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15");
} else {
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c),
[out] "r"(out_buf),
[mask_ptr] "r"(dmask)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
if (act_param.has_active) {
act_switch_process(out_buf, out_buf, w_out, &act_param);
}
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
} }
......
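The relu6 and leaky relu tails added in this file rely on a small NEON pattern: `fmin` against a broadcast six, and `cmhs`/`fmul`/`bif` to select between `x` and `x * scale`. An intrinsics sketch of the leaky relu select, written for clarity rather than as a transcription of the asm:

```cpp
#include <arm_neon.h>

// out[i] = x[i] >= 0 ? x[i] : x[i] * scale[i]; the semantics of the
// LEAKY_RELU blocks above (compare, multiply, then per-lane select).
static inline float32x4_t leaky_relu_f32x4(float32x4_t x, float32x4_t vscale) {
  uint32x4_t ge_zero = vcgeq_f32(x, vdupq_n_f32(0.f));  // lane mask: x >= 0
  float32x4_t scaled = vmulq_f32(x, vscale);            // x * scale
  return vbslq_f32(ge_zero, x, scaled);                 // pick x or scaled
}
```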
...@@ -25,6 +25,511 @@ namespace paddle { ...@@ -25,6 +25,511 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
#ifdef __aarch64__
#define COMPUTE \
"ldr q8, [%[bias]]\n" /* load bias */ \
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ \
"and v19.16b, v8.16b, v8.16b\n" \
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ \
"and v20.16b, v8.16b, v8.16b\n" \
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ \
"and v21.16b, v8.16b, v8.16b\n" \
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ \
"and v22.16b, v8.16b, v8.16b\n" \
"ldr q8, [%[inr0]]\n" /* load input r0*/ \
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \
"ldr q8, [%[inr1]]\n" /* load input r1*/ \
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ \
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ \
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ \
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ \
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ \
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ \
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ \
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ \
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ \
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ \
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ \
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ \
"ldr q8, [%[inr2]]\n" /* load input r2*/ \
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ \
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ \
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ \
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ \
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ \
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ \
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ \
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ \
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ \
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ \
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/ \
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ \
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ \
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ \
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ \
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ \
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ \
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ \
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ \
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
#define RELU /* relu */ \
"movi v0.4s, #0\n" /* for relu */ \
"fmax v19.4s, v19.4s, v0.4s\n" \
"fmax v20.4s, v20.4s, v0.4s\n" \
"fmax v21.4s, v21.4s, v0.4s\n" \
"fmax v22.4s, v22.4s, v0.4s\n"
#define RELU6 /* relu6 */ \
"fmin v19.4s, v19.4s, %[vsix].4s\n" \
"fmin v20.4s, v20.4s, %[vsix].4s\n" \
"fmin v21.4s, v21.4s, %[vsix].4s\n" \
"fmin v22.4s, v22.4s, %[vsix].4s\n"
#define LEAKY_RELU /* LeakyRelu */ \
"movi v0.4s, #0\n" /* for relu */ \
"cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */ \
"cmhs v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */ \
"cmhs v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */ \
"cmhs v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \
"bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \
"bif v19.16b, v8.16b, v7.16b \n" /* choose*/
#define STORE /* save result */ \
"str q19, [%[outc0]], #16\n" \
"str q20, [%[outc1]], #16\n" \
"str q21, [%[outc2]], #16\n" \
"str q22, [%[outc3]], #16\n"
#else
#define COMPUTE \
/* fill with bias */ \
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ /* load weights */ \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ \
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ \
"vand.i32 q12, q8, q8\n" \
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ \
"vand.i32 q13, q8, q8\n" \
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ \
"vand.i32 q14, q8, q8\n" \
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ \
"vand.i32 q15, q8, q8\n" \
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ \
"vmla.f32 q12, q9, q0 @ w0 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w0 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ \
"vmla.f32 q14, q9, q4 @ w0 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w0 * inr6\n" \
"vmla.f32 q12, q10, q1 @ w1 * inr1\n" \
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" \
"vmla.f32 q13, q10, q3 @ w1 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w1 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w1 * inr7\n" \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ \
"vmla.f32 q12, q11, q2 @ w2 * inr2\n" \
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" \
"vmla.f32 q13, q11, q4 @ w2 * inr4\n" \
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" \
"vmla.f32 q14, q11, q6 @ w2 * inr6\n" \
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" \
"vmla.f32 q15, q11, q8 @ w2 * inr8\n" /* mul r1 with w3, w4*/ \
"vmla.f32 q12, q9, q0 @ w3 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w3 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ \
"vmla.f32 q14, q9, q4 @ w3 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w3 * inr6\n" \
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ \
"vmla.f32 q12, q10, q1 @ w4 * inr1\n" \
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" \
"vmla.f32 q13, q10, q3 @ w4 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w4 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w4 * inr7\n" \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ \
"vmla.f32 q12, q11, q2 @ w5 * inr2\n" \
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" \
"vmla.f32 q13, q11, q4 @ w5 * inr4\n" \
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" \
"vmla.f32 q14, q11, q6 @ w5 * inr6\n" \
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n" \
"vmla.f32 q15, q11, q8 @ w5 * inr8\n" /* mul r2 with w6, w7*/ \
"vmla.f32 q12, q9, q0 @ w6 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w6 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ \
"vmla.f32 q14, q9, q4 @ w6 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w6 * inr6\n" \
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ \
"vmla.f32 q12, q10, q1 @ w7 * inr1\n" \
"vmla.f32 q13, q10, q3 @ w7 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w7 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w7 * inr7\n" \
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" \
"vmla.f32 q12, q11, q2 @ w8 * inr2\n" \
"vmla.f32 q13, q11, q4 @ w8 * inr4\n" \
"vmla.f32 q14, q11, q6 @ w8 * inr6\n" \
"vmla.f32 q15, q11, q8 @ w8 * inr8\n" /* transpose */ \
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ \
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ \
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ \
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
#define RELU /* relu */ \
"vmov.u32 q0, #0\n" \
"vld1.32 {d2-d3}, [%[six_ptr]]\n" \
"vmax.f32 q12, q12, q0\n" \
"vmax.f32 q13, q13, q0\n" \
"vmax.f32 q14, q14, q0\n" \
"vmax.f32 q15, q15, q0\n"
#define RELU6 /* relu6 */ \
"vmin.f32 q12, q12, q1\n" \
"vmin.f32 q13, q13, q1\n" \
"vmin.f32 q14, q14, q1\n" \
"vmin.f32 q15, q15, q1\n"
#define LEAKY_RELU /* LeakyRelu */ \
"vmov.u32 q0, #0\n" \
"vld1.32 {d2-d3}, [%[scale_ptr]]\n" \
"vcge.f32 q2, q12, q0 @ q0 > 0 \n" \
"vcge.f32 q4, q13, q0 @ q0 > 0 \n" \
"vcge.f32 q6, q14, q0 @ q0 > 0 \n" \
"vcge.f32 q8, q15, q0 @ q0 > 0 \n" \
"vmul.f32 q3, q12, q1 @ mul \n" \
"vmul.f32 q5, q13, q1 @ mul \n" \
"vmul.f32 q7, q14, q1 @ mul \n" \
"vmul.f32 q9, q15, q1 @ mul \n" \
"vbif q12, q3, q2 @ choose \n" \
"vbif q13, q5, q4 @ choose \n" \
"vbif q14, q7, q6 @ choose \n" \
"vbif q15, q9, q8 @ choose \n"
#define STORE /* save result */ \
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ \
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ \
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ \
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
#endif
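// Note on the LEAKY_RELU blocks above: the cmhs/vcge + fmul + bif/vbif
// sequence is a branch-free select. In NEON intrinsics it is roughly the
// following (a sketch for one vector v, not part of this kernel):
//   uint32x4_t keep = vcgeq_f32(v, vdupq_n_f32(0.f));  // lanes where v >= 0
//   float32x4_t scaled = vmulq_f32(v, vscale);         // alpha * v
//   v = vbslq_f32(keep, v, scaled);                    // v if >= 0, else alpha * v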
void act_switch_3x3s2(const float* inr0,
const float* inr1,
const float* inr2,
float* outc0,
float* outc1,
float* outc2,
float* outc3,
const float* weight_c,
float* bias_local,
float32x4_t w0,
float32x4_t w1,
float32x4_t w2,
float32x4_t w3,
float32x4_t w4,
float32x4_t w5,
float32x4_t w6,
float32x4_t w7,
float32x4_t w8,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
#ifdef __aarch64__
float32x4_t vsix = vdupq_n_f32(tmp);
float32x4_t vscale = vdupq_n_f32(ss);
#else
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
#endif
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
asm volatile(COMPUTE RELU STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE RELU STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [six_ptr] "r"(vsix)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
asm volatile(COMPUTE RELU RELU6 STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local),
[vsix] "w"(vsix)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE RELU RELU6 STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [six_ptr] "r"(vsix)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
asm volatile(COMPUTE LEAKY_RELU STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local),
[vscale] "w"(vscale)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE LEAKY_RELU STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [scale_ptr] "r"(vscale)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
#ifdef __aarch64__
asm volatile(COMPUTE STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
}
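// For reference, a minimal scalar sketch of what one call of act_switch_3x3s2
// computes before the activation step: four output columns for one 4-channel
// block, with stride-2 input sampling. The helper below is illustrative only
// (it is not used by the kernel) and assumes the packed [column][channel]
// layout described in the asm comments above.
inline void depthwise_3x3s2_block_ref(const float* inr0,    // input row 2*oh
                                      const float* inr1,    // input row 2*oh + 1
                                      const float* inr2,    // input row 2*oh + 2
                                      const float* weight,  // 9 x 4 floats
                                      const float* bias,    // 4 floats
                                      float* out) {         // 4 x 4 floats
  const float* rows[3] = {inr0, inr1, inr2};
  for (int ow = 0; ow < 4; ++ow) {   // four output columns per call
    for (int c = 0; c < 4; ++c) {    // four channels per block
      float sum = bias[c];
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          sum += weight[(kh * 3 + kw) * 4 + c] *
                 rows[kh][(2 * ow + kw) * 4 + c];  // stride-2 sampling
        }
      }
      out[c * 4 + ow] = sum;  // row c goes to outc0..outc3 respectively
    }
  }
}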
void conv_3x3s2_depthwise_fp32(const float* i_data, void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data, float* o_data,
...@@ -38,6 +543,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -38,6 +543,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
auto paddings = *param.paddings; auto paddings = *param.paddings;
int threads = ctx->threads(); int threads = ctx->threads();
...@@ -51,11 +557,9 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -51,11 +557,9 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const int win_round = ROUNDUP(win_ext, 4); const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1; const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block; const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size = auto workspace_size = threads * prein_size + win_round + ow_round;
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size); ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
/// get workspace /// get workspace
...@@ -77,6 +581,8 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -77,6 +581,8 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
remain = remain > 0 ? remain : 0; remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block; int row_len = win_round * out_c_block;
float32x4_t vzero = vdupq_n_f32(0.f);
for (int n = 0; n < bs; ++n) { for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel; const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel; float* dout_batch = o_data + n * oc * size_out_channel;
...@@ -147,201 +653,47 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -147,201 +653,47 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
outc2 = pre_out + 8; outc2 = pre_out + 8;
outc3 = pre_out + 12; outc3 = pre_out + 12;
} }
// clang-format off
#ifdef __aarch64__ #ifdef __aarch64__
asm volatile( act_switch_3x3s2(inr0,
"ldr q8, [%[bias]]\n" /* load bias */ inr1,
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ inr2,
"and v19.16b, v8.16b, v8.16b\n" outc0,
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ outc1,
"and v20.16b, v8.16b, v8.16b\n" outc2,
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ outc3,
"and v21.16b, v8.16b, v8.16b\n" weight_c,
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ bias_local,
"and v22.16b, v8.16b, v8.16b\n" w0,
"ldr q8, [%[inr0]]\n" /* load input r0*/ w1,
/* r0 mul w0-w2, get out */ w2,
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ w3,
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ w4,
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ w5,
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ w6,
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ w7,
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ w8,
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ act_param);
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else #else
asm volatile( act_switch_3x3s2(inr0,
/* fill with bias */ inr1,
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ inr2,
/* load weights */ outc0,
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ outc1,
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ outc2,
"vand.i32 q12, q8, q8\n" outc3,
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ weight_c,
"vand.i32 q13, q8, q8\n" bias_local,
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ vzero,
"vand.i32 q14, q8, q8\n" vzero,
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ vzero,
"vand.i32 q15, q8, q8\n" vzero,
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ vzero,
/* mul r0 with w0, w1, w2 */ vzero,
"vmla.f32 q12, q9, q0 @ w0 * inr0\n" vzero,
"vmla.f32 q13, q9, q2 @ w0 * inr2\n" vzero,
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ vzero,
"vmla.f32 q14, q9, q4 @ w0 * inr4\n" act_param);
"vmla.f32 q15, q9, q6 @ w0 * inr6\n" #endif
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif // __arch64__
// clang-format off
if (flag_mask) { if (flag_mask) {
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i]; c0[i] = pre_out[i];
...@@ -350,6 +702,13 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -350,6 +702,13 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
c3[i] = pre_out[i + 12]; c3[i] = pre_out[i + 12];
} }
} }
inr0 += 32;
inr1 += 32;
inr2 += 32;
outc0 += 4;
outc1 += 4;
outc2 += 4;
outc3 += 4;
} }
} }
} }
......
...@@ -2151,6 +2151,210 @@ inline void act_switch_c8_fp32(const float* din_ptr, ...@@ -2151,6 +2151,210 @@ inline void act_switch_c8_fp32(const float* din_ptr,
} }
} }
#ifdef __aarch64__
#define LOAD_DATA \
"1: \n" \
"ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/
#define DO_RELU \
"fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */
#define DO_RELU6 \
"fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
#define DO_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
"bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \
"bif v3.16b, v11.16b, v10.16b \n" /* choose*/
#define DO_STORE \
"subs %w[cnt], %w[cnt], #1 \n" \
"st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"bne 1b \n"
#else
#define LOAD_DATA \
"1: \n" \
"vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n"
#define DO_RELU \
"vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
#define DO_RELU6 \
"vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
"vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
#define DO_LEAKY_RELU \
"vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q13, q13 @ choose \n"
#define DO_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"bne 1b \n"
#endif
/*
 * Apply the fused activation to the data.
 * Currently supports relu, relu6 and leaky_relu.
 */
inline void act_switch_process(float* src,
float* dst,
int size,
const operators::ActivationParam* act_param) {
int cnt = size >> 4;
int remain = size % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
if (act_param != nullptr && act_param->has_active) {
float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
if (cnt > 0) {
switch (act_param->active_type) {
case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero)
: "memory", "cc", "v0", "v1", "v2", "v3");
#else
asm volatile(
LOAD_DATA DO_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero)
: "memory", "cc", "q3", "q4", "q5", "q6");
#endif
break;
case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_RELU DO_RELU6 DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vsix] "w"(vsix)
: "memory", "cc", "v0", "v1", "v2", "v3");
#else
asm volatile(
LOAD_DATA DO_RELU DO_RELU6 DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vsix] "w"(vsix)
: "memory", "cc", "q3", "q4", "q5", "q6");
#endif
break;
case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_LEAKY_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vscale] "w"(vscale)
: "memory",
"cc",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else
asm volatile(
LOAD_DATA DO_LEAKY_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vscale] "w"(vscale)
: "memory",
"cc",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14");
#endif
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param->active_type)
<< " fuse not support";
}
}
// remain
switch (act_param->active_type) {
case lite_api::ActivationType::kRelu:
for (int i = 0; i < remain; i++) {
*dst = *src >= 0.f ? *src : 0.f;
src++;
dst++;
        }
        break;
case lite_api::ActivationType::kRelu6:
for (int i = 0; i < remain; i++) {
float tmp = *src >= 0.f ? *src : 0.f;
*dst = tmp <= act_param->Relu_clipped_coef
? tmp
: act_param->Relu_clipped_coef;
src++;
dst++;
        }
        break;
case lite_api::ActivationType::kLeakyRelu:
for (int i = 0; i < remain; i++) {
if (*src >= 0.f) {
*dst = *src;
} else {
*dst = *src * act_param->Leaky_relu_alpha;
}
src++;
dst++;
}
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param->active_type)
<< " fuse not support";
}
}
}
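// A minimal usage sketch (buffer and size are illustrative only): apply a
// fused relu6 in place to a contiguous block of floats.
//
//   operators::ActivationParam act_param;
//   act_param.has_active = true;
//   act_param.active_type = lite_api::ActivationType::kRelu6;
//   act_param.Relu_clipped_coef = 6.f;
//   act_switch_process(buffer, buffer, size, &act_param);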
/*write result in outputs /*write result in outputs
* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w]
*/ */
......
...@@ -52,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -52,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s1_fp32(const float* din, void conv_depthwise_3x3s1_fp32(const float* din,
...@@ -67,7 +68,6 @@ void conv_depthwise_3x3s1_fp32(const float* din, ...@@ -67,7 +68,6 @@ void conv_depthwise_3x3s1_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu,
const operators::ActivationParam act_param, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
...@@ -84,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -84,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
template <typename Dtype> template <typename Dtype>
......
...@@ -584,7 +584,6 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -584,7 +584,6 @@ void conv_depthwise_3x3_fp32(const void* din,
const int pad_w = paddings[2]; const int pad_w = paddings[2];
int stride = param.strides[1]; int stride = param.strides[1];
int pad = pad_w; int pad = pad_w;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
bool pads_equal = bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
...@@ -603,7 +602,6 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -603,7 +602,6 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu,
act_param, act_param,
ctx); ctx);
} else { } else {
...@@ -638,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -638,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu, act_param,
ctx); ctx);
} else { } else {
conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din), conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din),
...@@ -653,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -653,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din,
reinterpret_cast<const float*>(weights), reinterpret_cast<const float*>(weights),
bias, bias,
param, param,
act_param,
ctx); ctx);
} }
} else { } else {
......
...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n"
"bne 1b \n" "bne 1b \n"
"fadd v8.4s, v8.4s, v9.4s \n"
"2:\n" "2:\n"
"fadd v8.4s, v8.4s, v9.4s \n"
"st1 {v8.4s}, [%[c]], #16 \n" "st1 {v8.4s}, [%[c]], #16 \n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"vld1.32 {d2-d5}, [%[a]]! \n" "vld1.32 {d2-d5}, [%[a]]! \n"
"bne 1b \n" "bne 1b \n"
"vadd.f32 q5, q5, q6 \n"
"2:\n" "2:\n"
"vadd.f32 q5, q5, q6 \n"
"vst1.32 {d10-d11}, [%[c]]!\n" "vst1.32 {d10-d11}, [%[c]]!\n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
......
...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, ...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param,
this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
} }
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnnMathType_t math_type =
use_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
CUDNN_CHECK(cudnnSetConvolutionMathType(this->conv_desc_, math_type));
#endif
if (ic == param.groups && ic == oc && ic != 1) { if (ic == param.groups && ic == oc && ic != 1) {
this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} else if (1) { } else if (!param.var_length) {
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = param.filter->data<float>(); const auto* w_data = param.filter->data<float>();
auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA)); auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA));
......
...@@ -55,6 +55,8 @@ class Gemm { ...@@ -55,6 +55,8 @@ class Gemm {
PtypeOut* c, PtypeOut* c,
Context<TARGET(kCUDA)>* ctx); Context<TARGET(kCUDA)>* ctx);
cublasHandle_t get_handle() const { return cu_handle_; }
private: private:
cudaStream_t exe_stream_; cudaStream_t exe_stream_;
cublasHandle_t cu_handle_; cublasHandle_t cu_handle_;
......
...@@ -30,7 +30,12 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build( ...@@ -30,7 +30,12 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
// The XPU compiler build the graph and fill all of the constant params, only // The XPU compiler build the graph and fill all of the constant params, only
// one output is supported now. // one output is supported now.
xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); xtcl::Array<xtcl::xExpr> all_outs;
for (size_t i = 0; i < outputs->size(); i++) {
all_outs.push_back(*outputs->at(i));
}
xtcl::xNetwork network =
builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs));
auto target = xtcl::Target::Create(device_name_); auto target = xtcl::Target::Create(device_name_);
auto compiler = xtcl::network::xTensorCompiler(network, target); auto compiler = xtcl::network::xTensorCompiler(network, target);
compiler.SetParams(*params); // Set the data of constant tensors compiler.SetParams(*params); // Set the data of constant tensors
......
...@@ -35,12 +35,12 @@ void TestCase::CreateInstruction() { ...@@ -35,12 +35,12 @@ void TestCase::CreateInstruction() {
op_desc_.reset(new cpp::OpDesc()); op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph"); op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx); op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
op_desc_->SetInput("Inputs", op_desc_->input_vars()); auto in_names = sub_block_op_desc->input_vars();
op_desc_->SetOutput("Outputs", op_desc_->output_vars()); auto out_names = sub_block_op_desc->output_vars();
op_desc_->SetAttr<std::vector<std::string>>( op_desc_->SetInput("Inputs", in_names);
"input_data_names", sub_block_op_desc->input_vars()); op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>( op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
"output_data_names", sub_block_op_desc->output_vars()); op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type()); op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc); static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
} else { } else {
......
...@@ -188,13 +188,17 @@ class Arena { ...@@ -188,13 +188,17 @@ class Arena {
tester_->Prepare(); tester_->Prepare();
} }
bool TestPrecision() { bool TestPrecision(const std::vector<std::string>& exclude_outs = {}) {
tester_->RunBaseline(tester_->baseline_scope()); tester_->RunBaseline(tester_->baseline_scope());
tester_->RunInstruction(); tester_->RunInstruction();
bool success = true; bool success = true;
for (auto& out : tester_->op_desc().OutputArgumentNames()) { for (auto& out : tester_->op_desc().OutputArgumentNames()) {
for (auto& var : tester_->op_desc().Output(out)) { for (auto& var : tester_->op_desc().Output(out)) {
if (std::find(exclude_outs.begin(), exclude_outs.end(), var) !=
exclude_outs.end()) {
continue;
}
success = success && CompareTensor(out, var); success = success && CompareTensor(out, var);
} }
} }
...@@ -209,7 +213,17 @@ class Arena { ...@@ -209,7 +213,17 @@ class Arena {
} }
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - timer); std::chrono::high_resolution_clock::now() - timer);
LOG(INFO) << "average duration: " << duration.count() << " ms";
timer = std::chrono::high_resolution_clock::now();
for (int i = 0; i < times; i++) {
tester_->RunBaseline(tester_->baseline_scope());
}
auto duration_basic = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - timer);
LOG(INFO) << "average lite duration: " << duration.count() << " ms";
LOG(INFO) << "average basic duration: " << duration_basic.count() << " ms";
LOG(INFO) << "speed up ratio: lite_speed / basic_speed: "
<< static_cast<float>(duration_basic.count()) / duration.count();
} }
private: private:
......
...@@ -16,9 +16,11 @@ lite_cc_library(mir_passes ...@@ -16,9 +16,11 @@ lite_cc_library(mir_passes
fusion/interpolate_fuse_pass.cc fusion/interpolate_fuse_pass.cc
fusion/conv_elementwise_fuse_pass.cc fusion/conv_elementwise_fuse_pass.cc
fusion/conv_activation_fuse_pass.cc fusion/conv_activation_fuse_pass.cc
fusion/var_conv_2d_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc fusion/conv_bn_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
......
...@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise ...@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise
lite_cc_library(fuse_conv_activation lite_cc_library(fuse_conv_activation
SRCS conv_activation_fuser.cc SRCS conv_activation_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_var_conv_activation
SRCS var_conv_2d_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_conv_bn lite_cc_library(fuse_conv_bn
SRCS conv_bn_fuser.cc SRCS conv_bn_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
...@@ -25,17 +28,22 @@ lite_cc_library(fuse_transpose_softmax_transpose ...@@ -25,17 +28,22 @@ lite_cc_library(fuse_transpose_softmax_transpose
lite_cc_library(fuse_interpolate lite_cc_library(fuse_interpolate
SRCS interpolate_fuser.cc SRCS interpolate_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
fuse_shuffle_channel fuse_shuffle_channel
fuse_conv_elementwise fuse_conv_elementwise
fuse_conv_activation fuse_conv_activation
fuse_var_conv_activation
fuse_conv_bn fuse_conv_bn
fuse_quant_dequant fuse_quant_dequant
fuse_elementwise_add_activation fuse_elementwise_add_activation
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void SequencePoolConcatFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::SequencePoolConcatFuser fuser;
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_sequence_pool_concat_fuse_pass,
paddle::lite::mir::SequencePoolConcatFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class SequencePoolConcatFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// """
// merge {sequence_pool x 7, concat} => merge_sequence_pool_and_concat
// src1 src2 src7 src1 src2 src7
// | | | | |
// v v | | ... |
// sequence_pool sequence_pool ...(sequence_pool) | | |
// | | | => -------------------
// --------------------------------- |
// | |
// v v
// concat sequence_pool_concat
// """
void SequencePoolConcatFuser::BuildPattern() {
// create nodes.
auto* concat = OpNode("concat", "concat")->AsIntermediate();
#define STR1(R) #R
#define STR2(R) STR1(R)
#define POOL_CONCAT_PATTERN(num) \
auto* x_##num = VarNode(STR2(sequence_pool_x_##num)) \
->assert_is_op_input("sequence_pool", "X") \
->AsInput(); \
auto* sequence_pool_##num = \
OpNode(STR2(sequence_pool_##num), "sequence_pool")->AsIntermediate(); \
auto* sequence_pool_##num##_out = \
VarNode(STR2(sequence_pool_##num##_out)) \
->assert_is_op_output("sequence_pool", "Out") \
->assert_is_op_nth_input("concat", "X", num - 1) \
->AsIntermediate(); \
auto* sequence_pool_##num##_idx = \
VarNode(STR2(sequence_pool_##num##_idx)) \
->assert_is_op_output("sequence_pool", "MaxIndex") \
->AsIntermediate(); \
*sequence_pool_##num >> *sequence_pool_##num##_idx; \
*x_##num >> *sequence_pool_##num >> *sequence_pool_##num##_out >> *concat;
auto* concat_out =
VarNode("concat_out")->assert_is_op_output("concat", "Out");
*concat >> *concat_out;
POOL_CONCAT_PATTERN(1);
POOL_CONCAT_PATTERN(2);
POOL_CONCAT_PATTERN(3);
POOL_CONCAT_PATTERN(4);
POOL_CONCAT_PATTERN(5);
POOL_CONCAT_PATTERN(6);
POOL_CONCAT_PATTERN(7);
#undef POOL_CONCAT_PATTERN
#undef STR1
#undef STR2
}
void SequencePoolConcatFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto sequence_pool_concat_op =
LiteOpRegistry::Global().Create("sequence_pool_concat");
auto concat = matched.at("concat")->stmt()->op();
auto* scope = concat->scope();
auto& valid_places = concat->valid_places();
sequence_pool_concat_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(sequence_pool_concat_op, valid_places);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_1"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_2"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_3"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_4"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_5"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_6"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_7"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("concat_out"));
}
cpp::OpDesc SequencePoolConcatFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("concat")->stmt()->op_info();
op_desc.SetType("sequence_pool_concat");
op_desc.SetInput("X",
{matched.at("sequence_pool_x_1")->arg()->name,
matched.at("sequence_pool_x_2")->arg()->name,
matched.at("sequence_pool_x_3")->arg()->name,
matched.at("sequence_pool_x_4")->arg()->name,
matched.at("sequence_pool_x_5")->arg()->name,
matched.at("sequence_pool_x_6")->arg()->name,
matched.at("sequence_pool_x_7")->arg()->name});
std::vector<std::string> pooltypes;
pooltypes.push_back(matched.at("sequence_pool_1")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_2")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_3")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_4")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_5")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_6")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_7")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
op_desc.SetAttr("pooltype", pooltypes);
op_desc.SetOutput("Out", {matched.at("concat_out")->arg()->name});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class SequencePoolConcatFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void VarConv2dActivationFusePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> act_types{"relu"};
for (auto act_type : act_types) {
fusion::VarConvActivationFuser fuser(act_type, "var_conv_2d");
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_var_conv_2d_activation_fuse_pass,
paddle::lite::mir::VarConv2dActivationFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class VarConv2dActivationFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void VarConvActivationFuser::BuildPattern() {
// create nodes.
auto* input = VarNode("X")->assert_is_op_input(conv_type_, "X")->AsInput();
auto* filter = VarNode("W")->assert_is_op_input(conv_type_, "W")->AsInput();
auto* conv2d = OpNode("var_conv_2d", conv_type_)->AsIntermediate();
auto* act = OpNode("act", act_type_)->AsIntermediate();
auto* conv2d_out = VarNode("conv2d_out")
->assert_is_op_output(conv_type_, "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
auto* conv2d_out_1 = VarNode("conv2d_out_1")
->assert_is_op_output(conv_type_, "Col")
->AsIntermediate();
auto* out =
VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
// create topology.
std::vector<PMNode*> conv2d_inputs{filter, input};
conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out;
*conv2d >> *conv2d_out_1;
}
void VarConvActivationFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
auto conv_old = matched.at("var_conv_2d")->stmt()->op();
auto* scope = conv_old->scope();
auto& valid_places = conv_old->valid_places();
conv_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
IR_NODE_LINK_TO(matched.at("X"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
cpp::OpDesc VarConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("var_conv_2d")->stmt()->op_info();
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
}
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class VarConvActivationFuser : public FuseBase {
public:
explicit VarConvActivationFuser(const std::string& act_type,
const std::string& conv_type)
: act_type_(act_type), conv_type_(conv_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string act_type_;
std::string conv_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -62,12 +62,14 @@ class Optimizer { ...@@ -62,12 +62,14 @@ class Optimizer {
// TODO(Superjomn) Refine the fusion related design to select fusion // TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically. // kernels for devices automatically.
"lite_conv_activation_fuse_pass", // "lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_fc_fuse_pass", // "lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", // "lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", // "lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", // "lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
......
...@@ -262,14 +262,10 @@ void Instruction::Run() { ...@@ -262,14 +262,10 @@ void Instruction::Run() {
if (op_->run_once() && has_run_) { if (op_->run_once() && has_run_) {
return; return;
} }
#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "kernel launch";
VLOG(4) << "kernel launch";
#endif
op_->InferShape(); op_->InferShape();
#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " // << TargetToStr(kernel_->target());
<< TargetToStr(kernel_->target());
#endif
kernel_->Launch(); kernel_->Launch();
has_run_ = true; has_run_ = true;
} }
......
...@@ -49,6 +49,7 @@ add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_ ...@@ -49,6 +49,7 @@ add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_
add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(grid_sampler_compute_arm ARM basic SRCS grid_sampler_compute.cc DEPS ${lite_kernel_deps} math_arm)
## 2.other basic kernels: basic kernels that not used in basic models ## 2.other basic kernels: basic kernels that not used in basic models
add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
...@@ -65,20 +65,20 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -65,20 +65,20 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
no_dilation && flag_dw) { no_dilation && flag_dw) {
/// dw conv impl /// dw conv impl
impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking dw conv"; // VLOG(3) << "invoking dw conv";
} else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
no_dilation && pads_all_equal) { no_dilation && pads_all_equal) {
/// winograd conv impl /// winograd conv impl
impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking winograd conv"; // VLOG(3) << "invoking winograd conv";
} else if (param.groups == 1 && kw == 3 && stride == 2 && } else if (param.groups == 1 && kw == 3 && stride == 2 &&
chin * chout < 4 * hin * win && kps_equal && no_dilation) { chin * chout < 4 * hin * win && kps_equal && no_dilation) {
/// direct conv impl /// direct conv impl
impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking direct conv"; // VLOG(3) << "invoking direct conv";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking gemm like conv"; // VLOG(3) << "invoking gemm like conv";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
...@@ -117,14 +117,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -117,14 +117,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
if (param.groups == ic && ic == oc && kps_equal && pads_equal && if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
no_dilation && flag_dw) { no_dilation && flag_dw) {
impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run DepthwiseConv Int8"; // VLOG(3) << "Run DepthwiseConv Int8";
} else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) &&
kps_equal && no_dilation) { kps_equal && no_dilation) {
impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run DirectConv Int8"; // VLOG(3) << "Run DirectConv Int8";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run GemmLikeConvInt8"; // VLOG(3) << "Run GemmLikeConvInt8";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
...@@ -163,14 +163,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -163,14 +163,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
if (param.groups == ic && ic == oc && kps_equal && pads_equal && if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
no_dilation && flag_dw) { no_dilation && flag_dw) {
impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run DepthwiseConv Int8"; // VLOG(3) << "Run DepthwiseConv Int8";
} else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) &&
kps_equal && no_dilation) { kps_equal && no_dilation) {
impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run DirectConv Int8"; // VLOG(3) << "Run DirectConv Int8";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run GemmLikeConvInt8"; // VLOG(3) << "Run GemmLikeConvInt8";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
......
...@@ -30,7 +30,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -30,7 +30,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
auto kw = w_dims[3]; auto kw = w_dims[3];
// select dw conv kernel // select dw conv kernel
if (kw == 3) { if (kw == 3) {
VLOG(5) << "invoke 3x3 dw conv fp32"; // VLOG(5) << "invoke 3x3 dw conv fp32";
auto paddings = *param.paddings; auto paddings = *param.paddings;
bool pads_equal = bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
...@@ -54,7 +54,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -54,7 +54,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} }
} else if (kw == 5) { } else if (kw == 5) {
VLOG(5) << "invoke 5x5 dw conv fp32"; // VLOG(5) << "invoke 5x5 dw conv fp32";
impl_ = lite::arm::math::conv_depthwise_5x5_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
} else { } else {
LOG(FATAL) << "this type dw conv not impl"; LOG(FATAL) << "this type dw conv not impl";
...@@ -86,7 +86,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -86,7 +86,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
/// select dw conv kernel /// select dw conv kernel
if (kw == 3) { if (kw == 3) {
// trans weights // trans weights
VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32; impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -96,7 +96,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -96,7 +96,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} else if (kw == 5) { } else if (kw == 5) {
// trans weights // trans weights
VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -145,7 +145,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -145,7 +145,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
/// select dw conv kernel /// select dw conv kernel
if (kw == 3) { if (kw == 3) {
// trans weights // trans weights
VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8; impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -155,7 +155,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -155,7 +155,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} else if (kw == 5) { } else if (kw == 5) {
// trans weights // trans weights
VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8; impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/grid_sampler_compute.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void GridSamplerCompute::PrepareForRun() {}
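// Bilinear grid sampler: each grid entry holds a normalized (x, y)
// coordinate in [-1, 1]. Stage one precomputes, for every sample point,
// the four neighbour indices (xw, xe, yn, ys), the interpolation
// distances and in-bounds masks; stage two blends the four neighbours
// for every channel.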
void GridSamplerCompute::Run() {
auto& param = this->Param<param_t>();
auto n = param.x->dims()[0];
auto c = param.x->dims()[1];
auto h = param.x->dims()[2];
auto w = param.x->dims()[3];
const float* in = param.x->data<float>();
const float* grid = param.grid->data<float>();
float* out = param.out->mutable_data<float>();
auto& ctx = this->ctx_->template As<ARMContext>();
const size_t coor_size = n * h * w;
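// workspace layout per sample point: 4 int32 corner indices, 4 float
// distances and 4 uint32 bound masks -> 12 four-byte values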
const size_t workspace_size = coor_size * 12 * sizeof(float);
ctx.ExtendWorkspace(workspace_size);
int32_t* coor_p = ctx.workspace_data<int>();
float* dis_p = reinterpret_cast<float*>(coor_p) + coor_size * 4;
uint32_t* bound_p = reinterpret_cast<uint32_t*>(dis_p) + coor_size * 4;
float x_max = static_cast<float>(w - 1);
float y_max = static_cast<float>(h - 1);
float32x4_t vxmax = vdupq_n_f32(x_max);
float32x4_t vymax = vdupq_n_f32(y_max);
float32x4_t vone = vdupq_n_f32(1.f);
float32x4_t vzero = vdupq_n_f32(0.f);
// compute coor, dis, bound
int i = coor_size;
for (; i > 3; i -= 4) {
float32x4x2_t xy = vld2q_f32(grid);
float32x4_t grid_x = vmulq_n_f32(vaddq_f32(xy.val[0], vone), 0.5 * x_max);
float32x4_t grid_y = vmulq_n_f32(vaddq_f32(xy.val[1], vone), 0.5 * y_max);
grid += 8;
// compute xw, xe, yn, ys
int32x4x4_t vcoor;
vcoor.val[0] = vcvtq_s32_f32(grid_x);
vcoor.val[2] = vcvtq_s32_f32(grid_y);
float32x4_t vxwf = vcvtq_f32_s32(vcoor.val[0]);
float32x4_t vynf = vcvtq_f32_s32(vcoor.val[2]);
float32x4_t vxef = vaddq_f32(vxwf, vone);
float32x4_t vysf = vaddq_f32(vynf, vone);
vcoor.val[1] = vcvtq_s32_f32(vxef);
vcoor.val[3] = vcvtq_s32_f32(vysf);
vst4q_s32(coor_p, vcoor);
coor_p += 16;
// compute dw, dn, de, ds
float32x4x4_t vdis;
vdis.val[0] = vsubq_f32(grid_x, vxwf);
vdis.val[2] = vsubq_f32(grid_y, vynf);
vdis.val[1] = vsubq_f32(vxef, grid_x);
vdis.val[3] = vsubq_f32(vysf, grid_y);
vst4q_f32(dis_p, vdis);
dis_p += 16;
// compute bound
uint32x4x4_t vbound;
uint32x4_t logic_xw =
vorrq_u32(vcltq_f32(vxwf, vzero), vcgtq_f32(vxwf, vxmax));
uint32x4_t logic_xe =
vorrq_u32(vcltq_f32(vxef, vzero), vcgtq_f32(vxef, vxmax));
uint32x4_t logic_yn =
vorrq_u32(vcltq_f32(vynf, vzero), vcgtq_f32(vynf, vymax));
uint32x4_t logic_ys =
vorrq_u32(vcltq_f32(vysf, vzero), vcgtq_f32(vysf, vymax));
vbound.val[0] = vmvnq_u32(vorrq_u32(logic_xw, logic_yn));
vbound.val[1] = vmvnq_u32(vorrq_u32(logic_xe, logic_yn));
vbound.val[2] = vmvnq_u32(vorrq_u32(logic_xw, logic_ys));
vbound.val[3] = vmvnq_u32(vorrq_u32(logic_xe, logic_ys));
vst4q_u32(bound_p, vbound);
bound_p += 16;
}
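// scalar tail for the remaining (< 4) sample points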
for (; i > 0; i--) {
float x = grid[0];
float y = grid[1];
float grid_x = (x + 1) * 0.5 * x_max;
float grid_y = (y + 1) * 0.5 * y_max;
grid += 2;
// compute xw, xe, yn, ys
int32_t xw = static_cast<int32_t>(floor(grid_x));
int32_t xe = xw + 1;
int32_t yn = static_cast<int32_t>(floor(grid_y));
int32_t ys = yn + 1;
*coor_p++ = xw;
*coor_p++ = xe;
*coor_p++ = yn;
*coor_p++ = ys;
// compute dw, de, dn, ds
float dw = grid_x - xw;
float de = xe - grid_x;
float dn = grid_y - yn;
float ds = ys - grid_y;
*dis_p++ = dw;
*dis_p++ = de;
*dis_p++ = dn;
*dis_p++ = ds;
// compute bound
bool logic_xw = (xw < 0.f || xw > x_max);
bool logic_xe = (xe < 0.f || xe > x_max);
bool logic_yn = (yn < 0.f || yn > y_max);
bool logic_ys = (ys < 0.f || ys > y_max);
*bound_p++ = ((logic_xw || logic_yn) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xe || logic_yn) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xw || logic_ys) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xe || logic_ys) ? 0 : 0xffffffff);
}
size_t cube_size = c * h * w;
size_t spatial_size = h * w;
// compute output
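// out = ds * (in_wn * de + in_en * dw) + dn * (in_ws * de + in_es * dw),
// with out-of-bounds neighbours zeroed by the precomputed masks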
for (int i = 0; i < n; ++i) {
const float* in_n = in + i * cube_size;
float* out_n = out + i * cube_size;
int32_t* coor_n = ctx.workspace_data<int>() + i * spatial_size * 4;
float* dis_n = reinterpret_cast<float*>(coor_n) + coor_size * 4;
uint32_t* bound_n = reinterpret_cast<uint32_t*>(dis_n) + coor_size * 4;
#pragma omp parallel for
for (int j = 0; j < c; ++j) {
int32_t* coor_ptr = coor_n;
float* dis_ptr = dis_n;
uint32_t* bound_ptr = bound_n;
const float* in_c = in_n + j * spatial_size;
float* out_c = out_n + j * spatial_size;
for (int k = 0; k < spatial_size; k++) {
int32x4_t vcoor = vld1q_s32(coor_ptr);
float32x4_t vdis = vld1q_f32(dis_ptr);
int32_t xw = vgetq_lane_s32(vcoor, 0);
int32_t xe = vgetq_lane_s32(vcoor, 1);
int32_t yn = vgetq_lane_s32(vcoor, 2);
int32_t ys = vgetq_lane_s32(vcoor, 3);
uint32x4_t vbound = vld1q_u32(bound_ptr);
float dw = vgetq_lane_f32(vdis, 0);
float de = vgetq_lane_f32(vdis, 1);
float dn = vgetq_lane_f32(vdis, 2);
float ds = vgetq_lane_f32(vdis, 3);
uint32_t wnbound = vgetq_lane_u32(vbound, 0);
uint32_t enbound = vgetq_lane_u32(vbound, 1);
uint32_t wsbound = vgetq_lane_u32(vbound, 2);
uint32_t esbound = vgetq_lane_u32(vbound, 3);
float in_wn = wnbound ? in_c[yn * w + xw] : 0.f;
float in_en = enbound ? in_c[yn * w + xe] : 0.f;
float in_ws = wsbound ? in_c[ys * w + xw] : 0.f;
float in_es = esbound ? in_c[ys * w + xe] : 0.f;
coor_ptr += 4;
dis_ptr += 4;
bound_ptr += 4;
*out_c++ =
ds * (in_wn * de + in_en * dw) + dn * (in_ws * de + in_es * dw);
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(grid_sampler,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::GridSamplerCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Grid", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class GridSamplerCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::GridSamplerParam;
void PrepareForRun() override;
void Run() override;
virtual ~GridSamplerCompute() = default;
private:
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -41,18 +41,20 @@ void PoolCompute::Run() { ...@@ -41,18 +41,20 @@ void PoolCompute::Run() {
std::vector<int>& paddings = *param.paddings; std::vector<int>& paddings = *param.paddings;
std::string& pooling_type = param.pooling_type; std::string& pooling_type = param.pooling_type;
bool global_pooling = param.global_pooling;
bool exclusive = param.exclusive; bool exclusive = param.exclusive;
bool adaptive = param.adaptive; bool adaptive = param.adaptive;
bool ceil_mode = param.ceil_mode; bool ceil_mode = param.ceil_mode;
bool use_quantizer = param.use_quantizer; bool use_quantizer = param.use_quantizer;
std::string& data_format = param.data_format; std::string& data_format = param.data_format;
bool pads_equal = bool pads_equal = (paddings[0] == paddings[1]) &&
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]);
bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && bool kps_equal =
(paddings[0] == paddings[2]); (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && pads_equal;
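// treat a kernel that spans the whole feature map with zero padding as
// global pooling, even when the global_pooling flag is not set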
bool global_pooling = (paddings[0] == 0) && (ksize[0] == in_dims[2]) &&
(ksize[1] == in_dims[3]) && pads_equal;
global_pooling = param.global_pooling || global_pooling;
if (global_pooling) { if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0; paddings[2 * i] = 0;
...@@ -83,8 +85,7 @@ void PoolCompute::Run() { ...@@ -83,8 +85,7 @@ void PoolCompute::Run() {
return; return;
} }
} else { } else {
if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) {
kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling2x2s2_max(din, lite::arm::math::pooling2x2s2_max(din,
dout, dout,
...@@ -110,7 +111,7 @@ void PoolCompute::Run() { ...@@ -110,7 +111,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p1_max(din, lite::arm::math::pooling3x3s1p1_max(din,
dout, dout,
...@@ -136,7 +137,7 @@ void PoolCompute::Run() { ...@@ -136,7 +137,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din, lite::arm::math::pooling3x3s1p0_max(din,
dout, dout,
...@@ -162,7 +163,7 @@ void PoolCompute::Run() { ...@@ -162,7 +163,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p0_max(din, lite::arm::math::pooling3x3s2p0_max(din,
dout, dout,
...@@ -188,7 +189,7 @@ void PoolCompute::Run() { ...@@ -188,7 +189,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p1_max(din, lite::arm::math::pooling3x3s2p1_max(din,
dout, dout,
......
...@@ -54,7 +54,7 @@ void SplitLodTensorCompute::Run() { ...@@ -54,7 +54,7 @@ void SplitLodTensorCompute::Run() {
} }
lod->clear(); lod->clear();
for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) { for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
VLOG(4) << "mask: " << mask_data[i]; // VLOG(4) << "mask: " << mask_data[i];
if (static_cast<size_t>(mask_data[i]) == t) { if (static_cast<size_t>(mask_data[i]) == t) {
size_t start_idx = i; size_t start_idx = i;
auto lod_and_offset = lite::arm::math::GetSubLoDAndAbsoluteOffset( auto lod_and_offset = lite::arm::math::GetSubLoDAndAbsoluteOffset(
......
...@@ -36,7 +36,7 @@ class StepExecutor { ...@@ -36,7 +36,7 @@ class StepExecutor {
auto &op_desc = *block->template GetOp<cpp::OpDesc>(i); auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
auto op_type = op_desc.Type(); auto op_type = op_desc.Type();
auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
VLOG(4) << "while: creating Op [" << op_type << "]"; // VLOG(4) << "while: creating Op [" << op_type << "]";
op_handler->Attach(op_desc, scope); op_handler->Attach(op_desc, scope);
auto hostplace = place_; auto hostplace = place_;
...@@ -51,9 +51,9 @@ class StepExecutor { ...@@ -51,9 +51,9 @@ class StepExecutor {
void Run() { void Run() {
for (auto &op_handler : ops_of_block_) { for (auto &op_handler : ops_of_block_) {
VLOG(4) << op_handler->op_info()->Repr(); // VLOG(4) << op_handler->op_info()->Repr();
op_handler->InferShape(); op_handler->InferShape();
VLOG(4) << "while: infered shape"; // VLOG(4) << "while: infered shape";
op_handler->Run(); op_handler->Run();
} }
} }
......
...@@ -11,6 +11,7 @@ add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${ ...@@ -11,6 +11,7 @@ add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_concat_compute_cuda CUDA extra SRCS sequence_pool_concat_compute.cu DEPS ${lite_kernel_deps})
add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose)
add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda})
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector> #include <vector>
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/cuda/match_matrix_tensor_compute.h" #include "lite/kernels/cuda/match_matrix_tensor_compute.h"
...@@ -20,6 +21,54 @@ namespace kernels { ...@@ -20,6 +21,54 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
template <typename dtype>
void gpu_transpose(
cublasHandle_t handle, const dtype* src, int M, int N, dtype* dst);
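// The float specialization below relies on cublasSgeam with a transposed
// first operand: it writes the transpose of the row-major M x N matrix
// `src` into `dst` (row-major N x M).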
template <>
void gpu_transpose<float>(
cublasHandle_t handle, const float* src, int M, int N, float* dst) {
float alpha = 1.0;
float beta = 0.0;
CUBLAS_CHECK(cublasSgeam(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
M,
N,
&alpha,
src,
N,
&beta,
dst,
M,
dst,
M));
}
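// Scatter the GEMM result, packed by the concatenated right sequences,
// into a dense [seq_num, tl, max_len_r] layout; positions beyond each
// sequence's real length are zero-filled.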
template <typename dtype>
__global__ void padding_out(const dtype* src,
const int* offset,
const int seq_num_r,
const int max_len_r,
const int tl,
const int count,
dtype* dst) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int thread_num = blockDim.x * gridDim.x;
for (tid = threadIdx.x + blockIdx.x * blockDim.x; tid < count;
tid += thread_num) {
int seq_id = tid / (tl * max_len_r);
int tl_id = (tid / (max_len_r)) % tl;
int r_id = tid % max_len_r;
int cur_len = offset[seq_id + 1] - offset[seq_id];
if (r_id < cur_len) {
dst[tid] = src[(offset[seq_id] + r_id) * tl + tl_id];
} else {
dst[tid] = 0.f;
}
}
}
void MatchMatrixTensorCompute::PrepareForRun() { void MatchMatrixTensorCompute::PrepareForRun() {
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>); gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
} }
...@@ -28,6 +77,7 @@ void MatchMatrixTensorCompute::Run() { ...@@ -28,6 +77,7 @@ void MatchMatrixTensorCompute::Run() {
CHECK(ctx_) << "running context should be set first"; CHECK(ctx_) << "running context should be set first";
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& context = this->ctx_->template As<CUDAContext>(); auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto* x = param.x; auto* x = param.x;
auto* w = param.w; auto* w = param.w;
...@@ -39,76 +89,74 @@ void MatchMatrixTensorCompute::Run() { ...@@ -39,76 +89,74 @@ void MatchMatrixTensorCompute::Run() {
const auto& offset_l = x->lod()[0]; const auto& offset_l = x->lod()[0];
const auto& offset_r = y->lod()[0]; const auto& offset_r = y->lod()[0];
std::vector<int> offset_r_int(offset_r.size());
std::vector<size_t> top_offset; std::transform(offset_r.begin(),
int top_size = 0; offset_r.end(),
top_offset.push_back(top_size); offset_r_int.begin(),
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { [](int64_t x) -> int { return static_cast<int>(x); });
int len_l = offset_l[b + 1] - offset_l[b];
int len_r = offset_r[b + 1] - offset_r[b]; int batch = offset_r.size() - 1;
top_size += dim_t * len_l * len_r; int len_l = offset_l[1] - offset_l[0];
top_offset.push_back(top_size); for (int i = 1; i < offset_l.size() - 1; i++) {
int cur_len = offset_l[i + 1] - offset_l[i];
CHECK_EQ(cur_len, len_l)
<< "each sequence of left matrix is the same length";
} }
int max_len_r = 0;
auto* bottom_l_data = x->data<float>(); for (int i = 0; i < offset_r.size() - 1; ++i) {
auto* bottom_r_data = y->data<float>(); int cur_len = offset_r[i + 1] - offset_r[i];
auto* t_data = w->data<float>(); max_len_r = cur_len > max_len_r ? cur_len : max_len_r;
auto* out_data = out->mutable_data<float>(TARGET(kCUDA));
auto* bottom_l_trans_data = tmp->mutable_data<float>(TARGET(kCUDA));
gemm_impl_->init(
false, false, x->dims()[0], dim_t * dim_in, dim_in, &context);
gemm_impl_->run(
1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context);
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
for (int t = 0; t < dim_t; t++) {
int len_l = offset_l[b + 1] - offset_l[b];
int len_r = offset_r[b + 1] - offset_r[b];
auto* top_data = out_data + top_offset[b] + t * len_l * len_r;
const auto* l_t_data =
bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
gemm_impl_->init(false,
true,
len_l,
len_r,
dim_in,
dim_t * dim_in,
dim_in,
len_r,
&context);
gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context);
}
} }
int batch_size = x->lod()[0].size() - 1; _input_l_transform.Resize({batch, dim_t, dim_in, len_l});
int lod_lv1_size = batch_size * dim_t; _input_l_transform_reorganize.Resize({batch, dim_t, len_l, dim_in});
int lod_lv2_size = x->lod()[0].back() * dim_t; _output_tmp.Resize({batch, max_len_r, dim_t, len_l});
std::vector<size_t> out_lod0(batch_size + 1, 0); out->Resize({batch, dim_t, len_l, max_len_r});
std::vector<size_t> out_lod1(lod_lv1_size + 1, 0);
std::vector<size_t> out_lod2(lod_lv2_size + 1, 0); _offset_r.Resize({static_cast<int64_t>(offset_r.size())});
for (int i = 0; i < batch_size; i++) { TargetWrapperCuda::MemcpyAsync(_offset_r.mutable_data<int>(TARGET(kCUDA)),
out_lod0[i + 1] = out_lod0[i] + dim_t; &offset_r_int[0],
int len_l = offset_l[i + 1] - offset_l[i]; sizeof(int) * offset_r.size(),
IoDirection::HtoD,
for (int j = 0; j < dim_t; j++) { stream);
out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l;
int len_r = offset_r[i + 1] - offset_r[i]; int len_r = offset_r[offset_r.size() - 1];
const float* input_l = x->data<float>();
for (int k = 0; k < len_l; k++) { const float* input_r = y->data<float>();
out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = const float* weight_data = w->data<float>();
out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; float* input_l_transform =
} _input_l_transform.mutable_data<float>(TARGET(kCUDA));
} float* input_l_transform_reorganize =
_input_l_transform_reorganize.mutable_data<float>(TARGET(kCUDA));
float* output_tmp = _output_tmp.mutable_data<float>(TARGET(kCUDA));
float* out_data = out->mutable_data<float>(TARGET(kCUDA));
gemm_impl_->init(true, true, dim_t * dim_in, len_l, dim_in, &context);
gemm_impl_->run(
1.0f, 0.0f, weight_data, input_l, input_l_transform, &context);
for (int i = 0; i < dim_t; ++i) {
int offset = i * dim_in * len_l;
gpu_transpose(gemm_impl_->get_handle(),
input_l_transform + offset,
dim_in,
len_l,
input_l_transform_reorganize + offset);
} }
gemm_impl_->init(false, true, len_r, dim_t * len_l, dim_in, &context);
LoD out_lod; gemm_impl_->run(
out_lod.push_back(top_offset); 1.0f, 0.0f, input_r, input_l_transform_reorganize, output_tmp, &context);
out_lod.push_back(offset_l); int seq_num = offset_r.size() - 1;
out_lod.push_back(offset_r); int count = seq_num * max_len_r * dim_t * len_l;
out->set_lod(out_lod); const int blocks = 512;
const int grids = (count + blocks - 1) / blocks;
padding_out<float><<<grids, blocks, 0, stream>>>(_output_tmp.data<float>(),
_offset_r.data<int>(),
seq_num,
max_len_r,
dim_t * len_l,
count,
out_data);
out->set_lod(y->lod());
} }
} // namespace cuda } // namespace cuda
......
...@@ -34,6 +34,10 @@ class MatchMatrixTensorCompute ...@@ -34,6 +34,10 @@ class MatchMatrixTensorCompute
private: private:
std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_; std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_;
lite::Tensor _input_l_transform;
lite::Tensor _input_l_transform_reorganize;
lite::Tensor _output_tmp;
lite::Tensor _offset_r;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -16,92 +16,6 @@ namespace paddle { ...@@ -16,92 +16,6 @@ namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
template <typename T>
static void anakin_NV_gemv(cublasHandle_t handle,
const bool TransA,
const int M,
const int N,
const T alpha,
const T* A,
const T* x,
const T beta,
T* y);
template <>
void anakin_NV_gemv<float>(cublasHandle_t handle,
const bool TransA,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y) {
cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBLAS_CHECK(
cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1));
}
template <typename T>
static void anakin_NV_gemm(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const T* B,
const T beta,
T* C);
template <>
void anakin_NV_gemm<float>(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C) {
// Note that cublas follows fortran order.
int lda = (!TransA /* == CblasNoTrans*/) ? K : M;
int ldb = (!TransB /* == CblasNoTrans*/) ? N : K;
cublasOperation_t cuTransA =
(!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(!TransB /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(cublasSgemm(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N));
}
template <>
void anakin_NV_gemm<char>(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const char alpha,
const char* A,
const char* B,
const char beta,
char* C) {
LOG(FATAL) << "int8 gemm is not implemented";
}
template <typename T> template <typename T>
static __global__ void add_bias(int n, static __global__ void add_bias(int n,
...@@ -115,6 +29,11 @@ static __global__ void add_bias(int n, ...@@ -115,6 +29,11 @@ static __global__ void add_bias(int n,
} }
} }
template <typename T>
void SearchFcCompute<T>::PrepareForRun() {
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
}
template <typename T> template <typename T>
void SearchFcCompute<T>::Run() { void SearchFcCompute<T>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -132,22 +51,10 @@ void SearchFcCompute<T>::Run() { ...@@ -132,22 +51,10 @@ void SearchFcCompute<T>::Run() {
const T* weight = w_tensor->data<T>(); const T* weight = w_tensor->data<T>();
const Tensor* b_tensor = param.b; const Tensor* b_tensor = param.b;
const T* bias = b_tensor->data<T>(); const T* bias = b_tensor->data<T>();
cublasCreate(&_handle);
if (_M == 1 && _K > 50000) { CHECK(gemm_impl_->init(false, true, _M, _N, _K, &ctx));
anakin_NV_gemv<T>(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); gemm_impl_->run(1.0f, 0.0f, din, weight, dout, &ctx);
} else {
anakin_NV_gemm<T>(_handle,
false,
!_flag_trans_weights,
_M,
_N,
_K,
(T)1,
din,
weight,
(T)0,
dout);
}
int total_size = _M * _N; int total_size = _M * _N;
add_bias<T><<<CUDA_GET_BLOCKS(total_size), CUDA_NUM_THREADS, 0, stream>>>( add_bias<T><<<CUDA_GET_BLOCKS(total_size), CUDA_NUM_THREADS, 0, stream>>>(
total_size, _N, bias, dout); total_size, _N, bias, dout);
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#pragma once #pragma once
#include <cudnn.h> #include <cudnn.h>
#include <memory>
#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/gemm.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
namespace paddle { namespace paddle {
...@@ -34,16 +36,15 @@ template <typename T> ...@@ -34,16 +36,15 @@ template <typename T>
class SearchFcCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { class SearchFcCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public: public:
using param_t = operators::SearchFcParam; using param_t = operators::SearchFcParam;
void PrepareForRun() override;
void Run() override; void Run() override;
virtual ~SearchFcCompute() = default; virtual ~SearchFcCompute() = default;
private: private:
bool _flag_trans_weights{false}; std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_{nullptr};
int _M; int _M;
int _K; int _K;
int _N; int _N;
cublasHandle_t _handle;
bool _is_continue_buf{true};
}; };
} // namespace cuda } // namespace cuda
......
...@@ -22,43 +22,44 @@ namespace lite { ...@@ -22,43 +22,44 @@ namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
const int CUDA_NUM_THREADS = 512; template <typename dtype>
__global__ void concat_impl_cuda(const int nthreads,
template <typename T> const dtype* in_data,
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs) { const int num_concats,
std::vector<size_t> result; const int concat_size,
result.resize(xs[0]->lod()[0].size()); const int top_concat_axis,
const int bottom_concat_axis,
for (size_t i = 1; i < result.size(); ++i) { const int offset_concat_axis,
size_t sum = 0; dtype* out_data) {
for (size_t j = 0; j < xs.size(); ++j) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
auto& x_lod = xs[j]->lod()[0]; index += blockDim.x * gridDim.x) {
sum += x_lod[i]; const int total_concat_size = concat_size * bottom_concat_axis;
} const int concat_num = index / total_concat_size;
result[i] = sum; const int concat_index = index % total_concat_size;
const int top_index =
concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
} }
LoD lod;
lod.emplace_back(result);
return lod;
} }
template <typename Dtype> template <typename dtype>
__global__ void ker_sequence_concat(Dtype* out_data, __global__ void concat_impl_2d_impl(const int inner_size,
const uint64_t* in_locate_data, const int num_concats,
const int* o2i_map, const dtype* in_data,
const int* o2i_w_map, const int concat_size,
const int seq_num, const int out_concat_axis,
const int emb_size, const int offset_concat_axis,
const int count) { dtype* out_data) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx_inner = threadIdx.x + blockIdx.x * blockDim.x;
for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { int idx_outer = threadIdx.y + blockIdx.y * blockDim.y;
int emb_id = tid % emb_size;
int word_id = tid / emb_size; if (idx_inner < inner_size && idx_outer < num_concats) {
int input_id = o2i_map[word_id]; int idx_input = idx_outer * inner_size + idx_inner;
int cur_work_id = o2i_w_map[word_id]; int idx_output =
const Dtype* in_data = reinterpret_cast<const Dtype*>( (idx_outer * out_concat_axis + offset_concat_axis) * concat_size +
reinterpret_cast<uintptr_t>(in_locate_data[input_id])); idx_inner;
out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; out_data[idx_output] = in_data[idx_input];
} }
} }
...@@ -66,73 +67,75 @@ void SequenceConcatCompute::Run() { ...@@ -66,73 +67,75 @@ void SequenceConcatCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>(); auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream(); auto stream = ctx.exec_stream();
float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
int seq_num = param.X[0]->lod()[0].size() - 1; const int BLOCK_SIZE = 32;
const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; const int axis = 1;
std::vector<uint64_t> in_locate_vec; int num_concats = param.X[0]->dims().count(0, axis);
for (size_t i = 0; i < param.X.size(); ++i) { int concat_input_size =
in_locate_vec.push_back( param.X[0]->dims().count(axis + 1, param.X[0]->dims().size());
reinterpret_cast<uintptr_t>(param.X[i]->data<float>()));
}
in_locate_tensor.Resize({static_cast<int64_t>(in_locate_vec.size())});
std::vector<int> out2in_map; int input_size = param.X.size();
std::vector<int> out2in_word_map; std::vector<std::vector<int64_t>> shapes_in(input_size);
for (int i = 0; i < seq_num; ++i) { for (int i = 0; i < input_size; ++i) {
for (int j = 0; j < param.X.size(); ++j) { shapes_in[i] = param.X[i]->dims().Vectorize();
auto offset = param.X[j]->lod()[0]; }
int cur_len = offset[i + 1] - offset[i]; std::vector<int64_t> shape_out = shapes_in[0];
for (int k = 0; k < cur_len; ++k) {
out2in_map.push_back(j); // compute output shape
out2in_word_map.push_back(offset[i] + k); for (int i = 1; i < input_size; ++i) {
for (int j = 0; j < shapes_in[i].size(); ++j) {
if (j == axis) {
continue;
} else if (shapes_in[i][j] != -1) {
CHECK_EQ(shape_out[j], shapes_in[i][j])
<< "All inputs must have the same shape, except at concat_axis.";
} }
} }
shape_out[axis] += shapes_in[i][axis];
} }
int word_num = out2in_map.size();
out2in_map_tensor.Resize({word_num});
out2in_word_map_tensor.Resize({word_num});
int* gpu_o2i_map_data = out2in_map_tensor.mutable_data<int>(TARGET(kCUDA));
int* gpu_o2i_w_map_data =
out2in_word_map_tensor.mutable_data<int>(TARGET(kCUDA));
uint64_t* gpu_in_locate_data =
in_locate_tensor.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, param.Out->Resize(shape_out);
out2in_map.data(), float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
sizeof(int) * out2in_map.size(), int offset_concat_axis = 0;
IoDirection::HtoD, const int out_concat_axis = shape_out[axis];
stream);
TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, for (int i = 0; i < input_size; ++i) {
out2in_word_map.data(), std::vector<int64_t> in_shape = param.X[i]->dims().Vectorize();
sizeof(int) * out2in_word_map.size(), const auto* in_data = param.X[i]->data<float>();
IoDirection::HtoD, const int in_concat_axis = in_shape[axis];
stream); const int in_concat_size = in_concat_axis * concat_input_size;
TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, const int nthreads = in_concat_size * num_concats;
in_locate_vec.data(), float ratio = static_cast<float>(in_concat_size) / num_concats;
sizeof(uint64_t) * in_locate_vec.size(), bool is_balance = (ratio > 0.1 && ratio < 10);
IoDirection::HtoD, if (is_balance) {
stream); int block_x = BLOCK_SIZE;
int block_y = BLOCK_SIZE;
param.Out->set_lod(ConcatLoD<float>(param.X)); int grid_x = (in_concat_size + block_x - 1) / block_x;
int grid_y = (num_concats + block_y - 1) / block_y;
int count = param.X[0]->numel(); dim3 block(block_x, block_y);
for (int i = 1; i < param.X.size(); ++i) { dim3 grid(grid_x, grid_y);
count += param.X[i]->numel(); concat_impl_2d_impl<float><<<grid, block, 0, stream>>>(in_concat_size,
num_concats,
in_data,
concat_input_size,
out_concat_axis,
offset_concat_axis,
out_data);
} else {
int grid = (nthreads + BLOCK_SIZE - 1) / BLOCK_SIZE;
concat_impl_cuda<float><<<grid, BLOCK_SIZE, 0, stream>>>(
nthreads,
in_data,
num_concats,
concat_input_size,
out_concat_axis,
in_concat_axis,
offset_concat_axis,
out_data);
}
offset_concat_axis += in_concat_axis;
} }
param.Out->set_lod(param.X[0]->lod());
int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
ker_sequence_concat<float><<<blocks, CUDA_NUM_THREADS, 0, stream>>>(
out_data,
gpu_in_locate_data,
gpu_o2i_map_data,
gpu_o2i_w_map_data,
seq_num,
emb_size,
count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
...@@ -27,11 +27,6 @@ class SequenceConcatCompute ...@@ -27,11 +27,6 @@ class SequenceConcatCompute
void Run() override; void Run() override;
virtual ~SequenceConcatCompute() = default; virtual ~SequenceConcatCompute() = default;
private:
lite::Tensor out2in_map_tensor;
lite::Tensor out2in_word_map_tensor;
lite::Tensor in_locate_tensor;
}; };
} // namespace cuda } // namespace cuda
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/sequence_pool_concat_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
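// Fused sequence_pool + concat kernel for the case where all inputs share
// the same feature size: each thread decodes (seq_id, in_id, em_id) from
// its output index and applies LAST (4) or MAX (6) pooling over the
// sequence range given by `offset`, writing straight into the
// concatenated output.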
template <typename Dtype>
__global__ void sequence_pool_concat(const uint64_t* input_locate_data,
const int* pool_type_list,
Dtype* output_data,
const int* offset,
int batch,
int in_num,
int in_dim) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int em_id = tid % in_dim;
int in_id = (tid / in_dim) % in_num;
int seq_id = tid / (in_dim * in_num);
if (seq_id >= batch) {
return;
}
Dtype* out_data = output_data + tid;
int offset_id = in_id * (batch + 1) + seq_id;
if (pool_type_list[in_id] == 4) { // last
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id;
output_data[tid] = in_data[(offset[offset_id + 1] - 1) * in_dim];
} else if (pool_type_list[in_id] == 6) { // max
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id + offset[offset_id] * in_dim;
Dtype max = in_data[0];
for (int i = 1; i < offset[offset_id + 1] - offset[offset_id]; i++) {
Dtype cur_data = in_data[i * in_dim];
max = cur_data > max ? cur_data : max;
}
output_data[tid] = max;
} else {
return;
}
}
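// Variant for inputs with different feature sizes: out_id_seq_map maps an
// output column back to its source input and out_offset recovers that
// input's column range and feature dim.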
template <typename Dtype>
__global__ void sequence_pool_concat(const uint64_t* input_locate_data,
const int* pool_type_list,
Dtype* output_data,
const int* offset,
int batch,
int in_num,
const int* out_offset,
const int* out_id_seq_map_data,
int out_dim) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int em_id = tid % out_dim;
int seq_id = tid / out_dim;
int in_id = out_id_seq_map_data[em_id];
em_id = em_id - out_offset[in_id];
int in_dim = out_offset[in_id + 1] - out_offset[in_id];
if (seq_id >= batch) {
return;
}
Dtype* out_data = output_data + tid;
int offset_id = in_id * (batch + 1) + seq_id;
if (pool_type_list[in_id] == 4) { // last
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id;
output_data[tid] = in_data[(offset[offset_id + 1] - 1) * in_dim];
} else if (pool_type_list[in_id] == 6) { // max
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id + offset[offset_id] * in_dim;
Dtype max = in_data[0];
for (int i = 1; i < offset[offset_id + 1] - offset[offset_id]; i++) {
Dtype cur_data = in_data[i * in_dim];
max = cur_data > max ? cur_data : max;
}
output_data[tid] = max;
} else {
return;
}
}
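// PrepareForRun uploads the metadata that is constant across batches:
// encoded pool types, output column offsets and the output-column -> input
// map; it also records whether all inputs share the same feature length so
// Run() can pick the simpler kernel.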
void SequencePoolConcatCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int in_num = param.X.size();
std::vector<int64_t> shape({in_num, 1, 1, 1});
_in_offset_tensor.Resize(shape);
_in_ptr_tensor.Resize(shape);
_in_pool_type_tensor.Resize(shape);
int* in_pool_type_data =
_in_pool_type_tensor.mutable_data<int>(TARGET(kCUDA));
std::vector<int> pool_type_list;
for (auto type : param.pool_type) {
if (type == "AVERAGE") {
pool_type_list.push_back(1);
} else if (type == "SUM") {
pool_type_list.push_back(2);
} else if (type == "SQRT") {
pool_type_list.push_back(3);
} else if (type == "LAST") {
pool_type_list.push_back(4);
} else if (type == "FIRST") {
pool_type_list.push_back(5);
} else if (type == "MAX") {
pool_type_list.push_back(6);
} else {
LOG(ERROR) << "pool type " << type << " is not supoorted.";
}
}
_is_in_same_len = true;
int in_len = param.X[0]->dims().count(1, param.X[0]->dims().size());
std::vector<int> out_id_seq_map_list;
std::vector<int> out_offset_list;
int total_len = 0;
out_offset_list.push_back(total_len);
for (int i = 0; i < in_num; ++i) {
int cur_len = param.X[i]->dims().count(1, param.X[i]->dims().size());
_is_in_same_len = _is_in_same_len && in_len == cur_len;
for (int k = 0; k < cur_len; ++k) {
out_id_seq_map_list.push_back(i);
}
total_len += cur_len;
out_offset_list.push_back(total_len);
}
std::vector<int64_t> out_id_seq_map_shape({total_len, 1, 1, 1});
std::vector<int64_t> out_offset_shape({in_num + 1, 1, 1, 1});
_out_offset_tensor.Resize(out_offset_shape);
_out_id_seq_map_tensor.Resize(out_id_seq_map_shape);
int* out_offset_data = _out_offset_tensor.mutable_data<int>(TARGET(kCUDA));
int* out_id_seq_map_data =
_out_id_seq_map_tensor.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(in_pool_type_data,
&pool_type_list[0],
sizeof(int) * param.X.size(),
IoDirection::HtoD,
stream);
TargetWrapperCuda::MemcpyAsync(out_offset_data,
&out_offset_list[0],
sizeof(int) * out_offset_list.size(),
IoDirection::HtoD,
stream);
TargetWrapperCuda::MemcpyAsync(out_id_seq_map_data,
&out_id_seq_map_list[0],
sizeof(int) * out_id_seq_map_list.size(),
IoDirection::HtoD,
stream);
cudaStreamSynchronize(stream);
}
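// Run gathers the lod offsets of every input into one flat array, uploads
// the raw input pointers, and launches the fused kernel (same-length or
// varying-length path).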
void SequencePoolConcatCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto& inputs = param.X;
auto offset = inputs[0]->lod()[0];
int batch = offset.size() - 1;
CHECK_GE(offset.size(), 1);
std::vector<int> all_offset;
for (int i = 0; i < inputs.size(); ++i) {
auto it = all_offset.end();
auto cur_offset = inputs[i]->lod()[0];
all_offset.insert(it, cur_offset.begin(), cur_offset.end());
}
int total_size = all_offset.size();
std::vector<int64_t> offset_shape({total_size, 1, 1, 1});
_in_offset_tensor.Resize(offset_shape);
int* offset_data = _in_offset_tensor.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(offset_data,
&all_offset[0],
sizeof(int) * all_offset.size(),
IoDirection::HtoD,
stream);
std::vector<uint64_t> in_locate_vec;
for (int i = 0; i < inputs.size(); ++i) {
in_locate_vec.push_back(
reinterpret_cast<uintptr_t>(inputs[i]->data<float>()));
}
uint64_t* in_locate_data =
_in_ptr_tensor.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(in_locate_data,
&in_locate_vec[0],
sizeof(uint64_t) * inputs.size(),
IoDirection::HtoD,
stream);
const int* in_pool_type_data = _in_pool_type_tensor.data<int>();
const int* out_id_seq_map_data = _out_id_seq_map_tensor.data<int>();
const int* out_offset_data = _out_offset_tensor.data<int>();
int count = param.Out->numel();
int in_dim = inputs[0]->numel() / inputs[0]->dims()[0];
float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
int in_num = inputs.size();
if (_is_in_same_len) {
sequence_pool_concat<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
in_locate_data,
in_pool_type_data,
out_data,
offset_data,
batch,
in_num,
in_dim);
} else {
int out_dim = param.Out->numel() / param.Out->dims()[0];
sequence_pool_concat<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
in_locate_data,
in_pool_type_data,
out_data,
offset_data,
batch,
in_num,
out_offset_data,
out_id_seq_map_data,
out_dim);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(sequence_pool_concat,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::SequencePoolConcatCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class SequencePoolConcatCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::SequencePoolConcatParam;
void Run() override;
void PrepareForRun() override;
virtual ~SequencePoolConcatCompute() = default;
private:
lite::Tensor _in_offset_tensor;
lite::Tensor _in_ptr_tensor;
lite::Tensor _in_pool_type_tensor;
lite::Tensor _out_offset_tensor;
lite::Tensor _out_id_seq_map_tensor;
bool _is_in_same_len;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -26,6 +26,8 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -26,6 +26,8 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
const Dtype *input, const Dtype *input,
const int *gpu_input_offset_l, const int *gpu_input_offset_l,
const int *gpu_input_offset_r, const int *gpu_input_offset_r,
const int row_max,
const int col_max,
const int topk_size, const int topk_size,
const int *topks, const int *topks,
const int feat_map_num) { const int feat_map_num) {
...@@ -33,20 +35,17 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -33,20 +35,17 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8
int col = gpu_input_offset_r[blockIdx.x + 1] - int col = gpu_input_offset_r[blockIdx.x + 1] -
gpu_input_offset_r[blockIdx.x]; // 30 gpu_input_offset_r[blockIdx.x]; // 30
int max_k = topks[topk_size - 1]; int max_k = topks[topk_size - 1];
max_k = max_k < col ? max_k : col; max_k = max_k < col ? max_k : col;
extern __shared__ Dtype smem[]; // H*W extern __shared__ Dtype smem[]; // H*W
const Dtype *fm_row_in_data = input; const Dtype *fm_row_in_data = input +
for (int i = 0; i < blockIdx.x; ++i) { blockIdx.x * row_max * feat_map_num * col_max +
int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; blockIdx.y * row_max * col_max;
int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i];
fm_row_in_data += tmp_row * feat_map_num * tmp_col;
}
fm_row_in_data += blockIdx.y * row * col;
for (int i = threadIdx.x; i < row * col; i += blockDim.x) { for (int i = threadIdx.x; i < row * col_max; i += blockDim.x) {
smem[i] = fm_row_in_data[i]; smem[i] = fm_row_in_data[i];
} }
__syncthreads(); __syncthreads();
...@@ -57,13 +56,13 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -57,13 +56,13 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
(gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size +
blockIdx.y * topk_size; blockIdx.y * topk_size;
Dtype *smem_start_col = smem + idx * col; Dtype *smem_start_col = smem + idx * col_max;
int counter = max_k; // topk_size; int counter = max_k; // topk_size;
Dtype last_max_val = -20000.0; Dtype last_max_val = -20000.0;
while (counter) { while (counter) {
Dtype max_val = -10000.0; Dtype max_val = -10000.0;
int max_pos = 0; int max_pos = 0; // -1;
int m = 0; int m = 0;
for (; m < col; m++) { for (; m < col; m++) {
Dtype cur_data = smem_start_col[m]; Dtype cur_data = smem_start_col[m];
...@@ -77,6 +76,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -77,6 +76,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
max_val = last_max_val; max_val = last_max_val;
} }
smem_start_col[max_pos] = -10000000.0; smem_start_col[max_pos] = -10000000.0;
int i = max_k - counter; int i = max_k - counter;
for (int c = 0; c < topk_size; c++) { for (int c = 0; c < topk_size; c++) {
if (i <= topks[c] - 1) { if (i <= topks[c] - 1) {
...@@ -98,22 +98,18 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -98,22 +98,18 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
auto &param = this->Param<param_t>(); auto &param = this->Param<param_t>();
auto &ctx = this->ctx_->template As<CUDAContext>(); auto &ctx = this->ctx_->template As<CUDAContext>();
auto cuda_stream = ctx.exec_stream(); auto cuda_stream = ctx.exec_stream();
int topk_num = param.topks.size();
lite::DDim top_ks_shape(std::vector<int64_t>{topk_num, 1, 1, 1});
_top_ks.Resize(top_ks_shape);
cudaMemcpyAsync(_top_ks.mutable_data<int>(TARGET(kCUDA)),
&param.topks[0],
sizeof(int) * topk_num,
cudaMemcpyHostToDevice,
cuda_stream);
int width_offset_len = param.COLUMN->lod()[0].size(); CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0)
lite::DDim width_offset_shape( << "X sequence offset is not valid";
std::vector<int64_t>{width_offset_len, 1, 1, 1}); CHECK(param.ROW->lod().size() > 0 && param.ROW->lod()[0].size() > 0)
<< "ROW sequence offset is not valid";
int width_offset_len = param.X->lod()[0].size();
lite::DDim width_offset_shape(std::vector<int64_t>{width_offset_len});
_width_offset.Resize(width_offset_shape); _width_offset.Resize(width_offset_shape);
std::vector<int> width_lod_0(width_offset_len, 0); std::vector<int> width_lod_0(width_offset_len, 0);
for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { for (size_t i = 0; i < param.X->lod()[0].size(); ++i) {
width_lod_0[i] = static_cast<int>(param.COLUMN->lod()[0][i]); width_lod_0[i] = static_cast<int>(param.X->lod()[0][i]);
} }
cudaMemcpyAsync(_width_offset.mutable_data<int>(TARGET(kCUDA)), cudaMemcpyAsync(_width_offset.mutable_data<int>(TARGET(kCUDA)),
&width_lod_0[0], &width_lod_0[0],
...@@ -122,8 +118,7 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -122,8 +118,7 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
cuda_stream); cuda_stream);
int height_offset_len = param.ROW->lod()[0].size(); int height_offset_len = param.ROW->lod()[0].size();
lite::DDim height_offset_shape( lite::DDim height_offset_shape(std::vector<int64_t>{height_offset_len});
std::vector<int64_t>{height_offset_len, 1, 1, 1});
_height_offset.Resize(height_offset_shape); _height_offset.Resize(height_offset_shape);
std::vector<int> height_lod_0(height_offset_len, 0); std::vector<int> height_lod_0(height_offset_len, 0);
for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) {
...@@ -139,39 +134,42 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -139,39 +134,42 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
Tensor *out_tensor = param.Out; Tensor *out_tensor = param.Out;
const T *in_data = x_tensor->data<T>(); const T *in_data = x_tensor->data<T>();
T *out_data = out_tensor->mutable_data<T>(TARGET(kCUDA)); T *out_data = out_tensor->mutable_data<T>(TARGET(kCUDA));
TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data<T>(TARGET(kCUDA)), TargetWrapperCuda::MemsetAsync(
0, out_data, 0, sizeof(T) * param.Out->numel(), cuda_stream);
sizeof(T) * out_tensor->numel(),
cuda_stream); int topk_num = param.topks.size();
lite::DDim top_ks_shape(std::vector<int64_t>{topk_num, 1, 1, 1});
_top_ks.Resize(top_ks_shape);
cudaMemcpyAsync(_top_ks.mutable_data<int>(TARGET(kCUDA)),
&param.topks[0],
sizeof(int) * topk_num,
cudaMemcpyHostToDevice,
cuda_stream);
int num = param.ROW->lod()[0].size() - 1; int num = param.X->dims()[0];
int channel = param.channel_num; int channel = param.X->dims()[1];
int height = param.X->dims()[2];
int width = param.X->dims()[3];
const int *height_offset = _height_offset.data<int>(); const int *height_offset = _height_offset.data<int>();
const int *width_offset = _width_offset.data<int>(); const int *width_offset = _width_offset.data<int>();
int feat_map_size = 0; int feat_map_size = height * width;
for (size_t i = 0; i < height_lod_0.size() - 1; ++i) {
int height = height_lod_0[i + 1] - height_lod_0[i];
int width = width_lod_0[i + 1] - width_lod_0[i];
if (height * width > feat_map_size) {
feat_map_size = height * width;
}
}
dim3 blocks(num, channel); dim3 blocks(num, channel);
dim3 threads(32, 1); dim3 threads(32, 1);
topk_avg_pooling_kernel_by_row_improve< topk_avg_pooling_kernel_by_row_improve<
T><<<blocks, threads, feat_map_size * sizeof(T), cuda_stream>>>( T><<<blocks, threads, feat_map_size * sizeof(T), cuda_stream>>>(
out_data, out_data,
in_data, in_data,
height_offset, height_offset,
width_offset, width_offset,
height,
width,
param.topks.size(), param.topks.size(),
_top_ks.data<int>(), _top_ks.data<int>(),
param.channel_num); param.channel_num);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
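For reference, the pooling the CUDA kernel above performs per (sequence, channel) pair can be sketched on the host as follows. This is only an illustrative reference, not code from the commit: the function name topk_avg_pool_row is hypothetical, and the "divide by k" averaging convention is an assumption.

#include <algorithm>
#include <functional>
#include <vector>

// Hypothetical host-side reference: for one row, emit the average of its
// top-k values for every k in `topks` (averaging by k is assumed here).
std::vector<float> topk_avg_pool_row(std::vector<float> row,
                                     const std::vector<int>& topks) {
  std::sort(row.begin(), row.end(), std::greater<float>());
  std::vector<float> out(topks.size(), 0.f);
  float running_sum = 0.f;
  const int max_k = *std::max_element(topks.begin(), topks.end());
  for (int i = 0; i < max_k && i < static_cast<int>(row.size()); ++i) {
    running_sum += row[i];
    for (size_t c = 0; c < topks.size(); ++c) {
      if (i < topks[c]) out[c] = running_sum / topks[c];  // sum of top-k / k
    }
  }
  return out;
}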
...@@ -21,6 +21,8 @@ namespace kernels { ...@@ -21,6 +21,8 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
const int CUDA_NUM_THREADS = 512;
extern __shared__ char tile[]; extern __shared__ char tile[];
template <typename dtype> template <typename dtype>
__global__ void sharemem_softmax_kernel(int total_size, __global__ void sharemem_softmax_kernel(int total_size,
...@@ -149,6 +151,15 @@ __global__ void softmax_divid_output_kernel(int total_size, ...@@ -149,6 +151,15 @@ __global__ void softmax_divid_output_kernel(int total_size,
} }
} }
void SoftmaxCompute::PrepareForRun() {
int device_id;
cudaGetDevice(&device_id);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device_id);
sharedmem_size = deviceProp.sharedMemPerBlock;
max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS;
}
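The budget computed in PrepareForRun bounds how long a softmax axis the shared-memory kernel can handle. A minimal arithmetic sketch, assuming a 48 KB sharedMemPerBlock (an assumed figure, not a queried one):

// Illustrative arithmetic only; 48 KB is an assumed sharedMemPerBlock value.
const size_t assumed_sharedmem_size = 48 * 1024;   // bytes per block
const int threads = 512;                           // CUDA_NUM_THREADS
const int max_dimsize = assumed_sharedmem_size / sizeof(float) / threads;
// -> 48 * 1024 / 4 / 512 = 24: axes of up to 24 elements take the
//    sharemem_softmax_kernel path in Run(); longer axes use the fallbacks.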
void SoftmaxCompute::Run() { void SoftmaxCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>(); auto& ctx = this->ctx_->template As<CUDAContext>();
...@@ -165,18 +176,10 @@ void SoftmaxCompute::Run() { ...@@ -165,18 +176,10 @@ void SoftmaxCompute::Run() {
int total_threads = inner_num * outer_num; int total_threads = inner_num * outer_num;
int axis_size = x_dims[axis]; int axis_size = x_dims[axis];
int device_id; const int threads = CUDA_NUM_THREADS;
const int threads = 512;
const int blocks = (total_threads + threads - 1) / threads; const int blocks = (total_threads + threads - 1) / threads;
cudaGetDevice(&device_id);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device_id);
size_t sharedmem_size = deviceProp.sharedMemPerBlock;
int max_dimsize = sharedmem_size / sizeof(float) / threads;
auto input_data = param.x->data<float>(); auto input_data = param.x->data<float>();
auto output_data = param.output->mutable_data<float>(TARGET(kCUDA)); auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
TargetWrapperCuda::MemsetSync(
output_data, 0, param.output->numel() * sizeof(float));
if (axis_size <= max_dimsize) { if (axis_size <= max_dimsize) {
int use_sharemem_size = axis_size * threads * sizeof(float); int use_sharemem_size = axis_size * threads * sizeof(float);
sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>( sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
......
...@@ -25,8 +25,14 @@ class SoftmaxCompute ...@@ -25,8 +25,14 @@ class SoftmaxCompute
public: public:
using param_t = operators::SoftmaxParam; using param_t = operators::SoftmaxParam;
void PrepareForRun() override;
void Run() override; void Run() override;
virtual ~SoftmaxCompute() = default; virtual ~SoftmaxCompute() = default;
private:
size_t sharedmem_size;
int num_threads;
int max_dimsize;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -25,224 +25,83 @@ namespace lite { ...@@ -25,224 +25,83 @@ namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
const int CUDA_NUM_THREADS = 512; inline int ConvOutputSize(int input_size,
int filter_size,
template <typename Dtype> int dilation,
__global__ void var_im2col_gpu_kernel(const int n, int pad_left,
const Dtype* data_im, int pad_right,
const int height, int stride) {
const int width, const int dkernel = dilation * (filter_size - 1) + 1;
const int kernel_h, int output_size =
const int kernel_w, (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
const int pad_h,
const int pad_w, return output_size;
const int stride_h,
const int stride_w,
const int height_col,
const int width_col,
Dtype* data_col) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int index = idx; index < n; index += blockDim.x * gridDim.x) {
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
Dtype* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const Dtype* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i;
int w_im = w_offset + j;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
? data_im_ptr[i * width + j]
: 0;
data_col_ptr += height_col * width_col;
}
}
}
} }
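A quick sanity check of the formula above, with values chosen purely for illustration:

// dkernel = 1 * (3 - 1) + 1 = 3
// output  = (19 + (1 + 1) - 3) / 1 + 1 = 19  (spatial size preserved)
int out = ConvOutputSize(/*input_size=*/19, /*filter_size=*/3, /*dilation=*/1,
                         /*pad_left=*/1, /*pad_right=*/1, /*stride=*/1);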
void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { void VarConv2DCompute::PrepareForRun() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
int input_channel = param.input_channel; conv_param_.x = const_cast<lite::Tensor*>(param.X);
int kernel_h = param.kernel_h; conv_param_.var_length = true;
int kernel_w = param.kernel_w;
int stride_h = param.stride_h; conv_param_.paddings.reset(new std::vector<int>);
int stride_w = param.stride_w; conv_param_.paddings->push_back(static_cast<int>(param.kernel_h / 2));
// auto* in_row = param.ROW; conv_param_.paddings->push_back(static_cast<int>(param.kernel_h / 2));
// auto* in_col = param.COLUMN; conv_param_.paddings->push_back(static_cast<int>(param.kernel_w / 2));
const auto* input = param.X; conv_param_.paddings->push_back(static_cast<int>(param.kernel_w / 2));
auto* col = param.Col; conv_param_.dilations.reset(new std::vector<int>);
conv_param_.dilations->push_back(1);
int batch = input->lod()[0].size() - 1; conv_param_.dilations->push_back(1);
const auto& bottom_offset = input->lod()[0]; conv_param_.strides[0] = param.stride_h;
// 2-D lod info. conv_param_.strides[1] = param.stride_w;
// const auto& offset_x = in_col->lod()[0]; conv_param_.filter = const_cast<lite::Tensor*>(param.W);
// const auto& offset_y = in_row->lod()[0]; conv_param_.filter->Resize({param.output_channel,
const auto& offset_y = param.X->lod()[1]; param.input_channel,
const auto& offset_x = param.X->lod()[2]; param.kernel_h,
// top offset is the whole size of each data sample param.kernel_w});
std::vector<uint64_t> top_offset;
int top_size = 0; conv_param_.output = param.Out;
top_offset.push_back(top_size); std::vector<int64_t> output_shape(
for (int b = 0; b < batch; ++b) { {conv_param_.x->dims()[0], param.output_channel});
int width = offset_x[b + 1] - offset_x[b]; for (size_t i = 0; i < conv_param_.strides.size(); ++i) {
int height = offset_y[b + 1] - offset_y[b]; output_shape.push_back(
int top_im_x = 0; ConvOutputSize(conv_param_.x->dims()[i + 2],
if (width == 0) { conv_param_.filter->dims()[i + 2],
top_im_x = 0; (*conv_param_.dilations.get())[i],
} else { (*conv_param_.paddings.get())[i * 2],
top_im_x = (width - 1) / stride_w + 1; (*conv_param_.paddings.get())[i * 2 + 1],
} conv_param_.strides[i]));
int top_im_y = 0;
if (height == 0) {
top_im_y = 0;
} else {
top_im_y = (height - 1) / stride_h + 1;
}
int top_x = top_im_x * top_im_y;
int top_y = input_channel * kernel_h * kernel_w;
top_size += top_y * top_x;
top_offset.push_back(top_size);
} }
if (param.fuse_relu) {
LoD col_lod; conv_param_.activation_param.has_active = true;
col_lod.push_back(top_offset); conv_param_.activation_param.active_type = lite_api::ActivationType::kRelu;
col->set_lod(col_lod);
std::vector<int64_t> col_dims_vec{top_size};
col_dims_vec.push_back(1);
col->Resize(col_dims_vec);
auto* top_data = col->mutable_data<float>(TARGET(kCUDA));
const auto* bottom_data = input->data<float>();
for (int b = 0; b < batch; ++b) {
int t_offset = top_offset[b];
int b_offset = bottom_offset[b];
int width = offset_x[b + 1] - offset_x[b];
int height = offset_y[b + 1] - offset_y[b];
if (width == 0 || height == 0) {
continue;
}
int width_col = (width - 1) / stride_w + 1;
int height_col = (height - 1) / stride_h + 1;
const float* data_im = bottom_data + b_offset;
float* data_col = top_data + t_offset;
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
int num_kernels = height_col * width_col * input_channel;
const int CUDA_NUM_BLOCKS =
(num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
var_im2col_gpu_kernel<
float><<<CUDA_NUM_BLOCKS, CUDA_NUM_THREADS, 0, stream>>>(
num_kernels,
data_im,
height,
width,
kernel_h,
kernel_w,
((stride_h - 1) * height + kernel_h - 1) / 2,
((stride_w - 1) * width + kernel_w - 1) / 2,
stride_h,
stride_w,
height_col,
width_col,
data_col);
} }
conv_param_.output->Resize({output_shape});
conv_impl_.reset(new lite::cuda::math::CudnnConv2D<PRECISION(kFloat)>);
conv_impl_->init(conv_param_, &context);
} }
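PrepareForRun derives the paddings from the kernel size (kernel_h / 2 and kernel_w / 2 on each side); for odd kernels at stride 1 this reproduces the input spatial size through ConvOutputSize. A minimal sketch under those assumptions, with an illustrative input size:

// Illustration only: odd kernel, stride 1, pad = kernel / 2 on each side.
int kernel_h = 5;
int pad = kernel_h / 2;  // 2
int out_h = ConvOutputSize(/*input_size=*/32, kernel_h, /*dilation=*/1,
                           pad, pad, /*stride=*/1);  // 32, i.e. "same"-style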
void VarConv2DCompute::Run() { void VarConv2DCompute::Run() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto* bottom = param.X; param.Out->set_lod(param.X->lod());
// auto* in_row = param.ROW; std::vector<int64_t> output_shape(
// auto* in_col = param.COLUMN; {conv_param_.x->dims()[0], param.output_channel});
auto* w = param.W; for (size_t i = 0; i < conv_param_.strides.size(); ++i) {
auto* top = param.Out; output_shape.push_back(
auto* col = param.Col; ConvOutputSize(conv_param_.x->dims()[i + 2],
int output_channel = param.output_channel; conv_param_.filter->dims()[i + 2],
int input_channel = param.input_channel; (*conv_param_.dilations.get())[i],
int kernel_h = param.kernel_h; (*conv_param_.paddings.get())[i * 2],
int kernel_w = param.kernel_w; (*conv_param_.paddings.get())[i * 2 + 1],
int stride_h = param.stride_h; conv_param_.strides[i]));
int stride_w = param.stride_w;
var_im2col(stream);
int batch = bottom->lod()[0].size() - 1;
const auto& col_offset = col->lod()[0];
// const auto& offset_x = in_col->lod()[0];
// const auto& offset_y = in_row->lod()[0];
const auto& offset_y = param.X->lod()[1];
const auto& offset_x = param.X->lod()[2];
std::vector<size_t> top_offset;
std::vector<int64_t> height_vector;
std::vector<int64_t> width_vector;
int top_size = 0;
top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) {
int width = offset_x[b + 1] - offset_x[b];
int height = offset_y[b + 1] - offset_y[b];
int top_im_x = 0;
if (width == 0) {
top_im_x = 0;
} else {
top_im_x = (width - 1) / stride_w + 1;
}
int top_im_y = 0;
if (height == 0) {
top_im_y = 0;
} else {
top_im_y = (height - 1) / stride_h + 1;
}
height_vector.push_back(top_im_y);
width_vector.push_back(top_im_x);
int top_im_size = top_im_y * top_im_x;
top_size += output_channel * top_im_size;
top_offset.push_back(top_size);
} }
conv_param_.output->Resize({output_shape});
LoD top_lod; conv_impl_->create(conv_param_, &context);
top_lod.push_back(top_offset); conv_impl_->run(conv_param_);
top->set_lod(top_lod);
std::vector<int64_t> top_dims_vec{top_size};
top_dims_vec.push_back(1);
top->Resize(top_dims_vec);
auto* top_data = top->mutable_data<float>(TARGET(kCUDA));
const auto* w_data = w->data<float>();
const auto* col_data = col->data<float>();
std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_;
for (int b = 0; b < batch; ++b) {
int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
if (top_im_size == 0) {
continue;
}
float* out_data = top_data + top_offset[b];
const float* in_data = col_data + col->lod()[0][b];
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
gemm_impl_->init(false,
false,
w->dims()[0],
height_vector[b] * width_vector[b],
input_channel * kernel_h * kernel_w,
&ctx);
gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include "lite/backends/cuda/math/cudnn_conv.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
namespace paddle { namespace paddle {
...@@ -25,10 +27,12 @@ class VarConv2DCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { ...@@ -25,10 +27,12 @@ class VarConv2DCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
using param_t = operators::VarConv2DParam; using param_t = operators::VarConv2DParam;
void Run() override; void Run() override;
void PrepareForRun() override;
virtual ~VarConv2DCompute() = default; virtual ~VarConv2DCompute() = default;
private: private:
void var_im2col(const cudaStream_t& stream); mutable operators::ConvParam conv_param_;
std::unique_ptr<lite::cuda::math::CudnnConv2D<PRECISION(kFloat)>> conv_impl_;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -52,7 +52,29 @@ class SequenceConcatCompute ...@@ -52,7 +52,29 @@ class SequenceConcatCompute
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
// auto& param = Param<param_t>();
int64_t batch_size = 0;
int64_t feature_size = 0;
std::vector<int64_t> out_dims;
for (const auto& tensor : param.X) {
const auto x_dims = tensor->dims();
if (out_dims.empty()) {
out_dims = x_dims.Vectorize();
}
batch_size += x_dims[0];
if (feature_size == 0) {
feature_size = x_dims.production() / x_dims[0];
} else {
CHECK_EQ(feature_size, x_dims.production() / x_dims[0])
<< "Inputs of sequence concat must have same feature size";
}
}
if (batch_size < 0) {
batch_size = -1; // Normalize batch size for compile time.
}
out_dims[0] = batch_size;
param.Out->Resize(out_dims);
T* dout = param.Out->mutable_data<T>(); T* dout = param.Out->mutable_data<T>();
std::vector<lite::Tensor> x_in_order; std::vector<lite::Tensor> x_in_order;
......
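The shape bookkeeping moved into Run() above simply stacks the inputs along dim 0 while requiring a common per-row feature size. A small worked example (shapes chosen for illustration, using the DDim vector constructor seen elsewhere in this diff):

std::vector<lite::DDim> in_dims = {lite::DDim(std::vector<int64_t>{3, 4}),
                                   lite::DDim(std::vector<int64_t>{5, 4})};
int64_t batch_size = 0;
for (const auto& d : in_dims) batch_size += d[0];  // 3 + 5 = 8
// common feature size 4 (enforced by the CHECK_EQ above) -> Out dims {8, 4}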
...@@ -14,6 +14,9 @@ lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${subgraph_brid ...@@ -14,6 +14,9 @@ lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${subgraph_brid
lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_xpu}) lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_xpu})
lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_xpu SRCS transpose_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_xpu SRCS reshape_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})
set(xpu_subgraph_bridges set(xpu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
...@@ -26,6 +29,9 @@ set(xpu_subgraph_bridges ...@@ -26,6 +29,9 @@ set(xpu_subgraph_bridges
subgraph_bridge_softmax_op_xpu subgraph_bridge_softmax_op_xpu
subgraph_bridge_mul_op_xpu subgraph_bridge_mul_op_xpu
subgraph_bridge_batch_norm_op_xpu subgraph_bridge_batch_norm_op_xpu
subgraph_bridge_transpose_op_xpu
subgraph_bridge_reshape_op_xpu
subgraph_bridge_layer_norm_op_xpu
CACHE INTERNAL "xpu_subgraph_bridges") CACHE INTERNAL "xpu_subgraph_bridges")
message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}") message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int LayerNormConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Get input vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto scale_var_name = op_info->Input("Scale").front();
auto* scale = scope->FindMutableTensor(scale_var_name);
auto bias_var_name = op_info->Input("Bias").front();
auto* bias = scope->FindMutableTensor(bias_var_name);
auto y_var_name = op_info->Output("Y").front();
auto epsilon = op_info->GetAttr<float>("epsilon");
auto axis = op_info->GetAttr<int>("begin_norm_axis");
// Create scale, bias nodes
auto scale_const_node = graph->AddNode(scale_var_name, *scale);
auto bias_const_node = graph->AddNode(bias_var_name, *bias);
// Create node and set params from op
auto layer_norm_node =
graph->builder_.CreateLayerNorm(*graph->GetNode(x_var_name),
*scale_const_node,
*bias_const_node,
axis,
epsilon,
true,
true);
graph->AddNode(y_var_name, graph->builder_.GetField(layer_norm_node, 0));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
layer_norm,
paddle::lite::subgraph::xpu::LayerNormConverter);
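For reference, layer_norm flattens X to [outer, H] at begin_norm_axis and normalizes each row; with the epsilon, Scale (gamma) and Bias (beta) read above, the standard formulation is:

\mu = \frac{1}{H}\sum_{i=1}^{H} x_i, \qquad
\sigma^2 = \frac{1}{H}\sum_{i=1}^{H}(x_i - \mu)^2, \qquad
y_i = \gamma_i \, \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_i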
...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(XPU, pool2d); ...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(XPU, pool2d);
USE_SUBGRAPH_BRIDGE(XPU, softmax); USE_SUBGRAPH_BRIDGE(XPU, softmax);
USE_SUBGRAPH_BRIDGE(XPU, mul); USE_SUBGRAPH_BRIDGE(XPU, mul);
USE_SUBGRAPH_BRIDGE(XPU, batch_norm); USE_SUBGRAPH_BRIDGE(XPU, batch_norm);
USE_SUBGRAPH_BRIDGE(XPU, transpose);
USE_SUBGRAPH_BRIDGE(XPU, transpose2);
USE_SUBGRAPH_BRIDGE(XPU, reshape);
USE_SUBGRAPH_BRIDGE(XPU, reshape2);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/reshape_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int ReshapeConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto scope = op->scope();
auto op_type = op_info->Type();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Create node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
std::vector<int> shape;
if (op_info->HasInput("ShapeTensor") &&
!op_info->Input("ShapeTensor").empty()) {
for (auto var_name : op_info->Input("ShapeTensor")) {
shape.emplace_back(scope->FindMutableTensor(var_name)->data<int>()[0]);
}
CHECK_GT(shape.size(), 0)
<< "ShapeError: When `shape` in ReshapeOp is a list or tuple "
"which contains Tensor, the shape's size can't be zero. "
"But received shape's size is "
<< shape.size();
} else if (op_info->HasInput("Shape") && !op_info->Input("Shape").empty()) {
auto shape_tensor =
scope->FindMutableTensor(op_info->Input("Shape").front());
auto shape_data = shape_tensor->data<int>();
shape = std::vector<int>(shape_data, shape_data + shape_tensor->numel());
} else if (op_info->HasAttr("shape")) {
shape = op_info->GetAttr<std::vector<int>>("shape");
} else {
LOG(FATAL) << "no new shape for reshape op";
}
auto out_dims =
operators::ValidateShape(shape, scope->FindTensor(x_var_name)->dims());
CHECK(graph->HasNode(x_var_name));
graph->AddNode(out_var_name,
graph->builder_.CreateReshape(*graph->GetNode(x_var_name),
Cvt2ArrayInt(out_dims)));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
reshape2,
paddle::lite::subgraph::xpu::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
reshape,
paddle::lite::subgraph::xpu::ReshapeConverter);
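The converter resolves the target shape with operators::ValidateShape before emitting the XPU node. As an illustration of the usual Paddle reshape conventions (stated here as an assumption about ValidateShape, not verified against its implementation):

// Assumed conventions: -1 is inferred from the remaining element count,
//                       0 copies the corresponding input dimension.
// Example: x dims = {4, 5, 19, 19} (7220 elements), shape attr = {0, -1}
//   -> out dims = {4, 1805}   // dim 0 kept, 7220 / 4 = 1805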
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int TransposeConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Create node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<std::vector<int>>("axis");
CHECK(graph->HasNode(x_var_name));
graph->AddNode(
out_var_name,
graph->builder_.CreateTranspose(
*graph->GetNode(x_var_name),
Cvt2ArrayInt(std::vector<int64_t>(axis.begin(), axis.end()))));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
transpose,
paddle::lite::subgraph::xpu::TransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
transpose2,
paddle::lite::subgraph::xpu::TransposeConverter);
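The axis attribute is a permutation of the input dimensions; a small example for orientation:

// x dims = {2, 3, 4}, axis = {0, 2, 1}  ->  out dims = {2, 4, 3}
// (out[i] takes its extent from x[axis[i]], as in Transpose2Op::InferShape.)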
...@@ -125,6 +125,18 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor, ...@@ -125,6 +125,18 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor,
return out_tensor; return out_tensor;
} }
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const std::vector<int64_t>& input) {
xtcl::Array<xtcl::Integer> output;
for (auto i : input) {
output.push_back(i);
}
return output;
}
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const DDim& input) {
return Cvt2ArrayInt(input.Vectorize());
}
} // namespace xpu } // namespace xpu
} // namespace subgraph } // namespace subgraph
} // namespace lite } // namespace lite
......
...@@ -47,6 +47,9 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor( ...@@ -47,6 +47,9 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(
PrecisionType in_ptype = PRECISION(kFloat), PrecisionType in_ptype = PRECISION(kFloat),
DataLayoutType in_ltype = DATALAYOUT(kNCHW)); DataLayoutType in_ltype = DATALAYOUT(kNCHW));
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const std::vector<int64_t>& input);
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const DDim& input);
} // namespace xpu } // namespace xpu
} // namespace subgraph } // namespace subgraph
} // namespace lite } // namespace lite
......
...@@ -60,9 +60,14 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -60,9 +60,14 @@ int SubgraphEngine::BuildDeviceProgram() {
// Obtain the output nodes of the XPU IR graph and build the graph to XPU // Obtain the output nodes of the XPU IR graph and build the graph to XPU
// runtime // runtime
std::vector<xtcl::xExpr*> output_nodes; std::vector<xtcl::xExpr*> output_nodes;
std::vector<std::string> valid_output_names;
for (auto& output_name : output_names_) { for (auto& output_name : output_names_) {
output_nodes.push_back(graph.GetNode(output_name).get()); if (graph.HasNode(output_name)) {
output_nodes.push_back(graph.GetNode(output_name).get());
valid_output_names.push_back(output_name);
}
} }
CHECK(!valid_output_names.empty()) << "[XPU] no valid output names";
device_program_ = lite::xpu::Device::Global().Build( device_program_ = lite::xpu::Device::Global().Build(
&graph.builder_, &graph.params_, &output_nodes); &graph.builder_, &graph.params_, &output_nodes);
if (device_program_ == nullptr) { if (device_program_ == nullptr) {
...@@ -73,16 +78,16 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -73,16 +78,16 @@ int SubgraphEngine::BuildDeviceProgram() {
// Query and check the dimensions of input and output tensors // Query and check the dimensions of input and output tensors
origin_idims_.resize(input_names_.size()); origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size()); origin_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size()); origin_odims_.resize(valid_output_names.size());
origin_otensors_.resize(output_names_.size()); origin_otensors_.resize(valid_output_names.size());
for (int i = 0; i < input_names_.size(); i++) { for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]); CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims(); origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i]; VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i];
} }
for (int i = 0; i < output_names_.size(); i++) { for (int i = 0; i < valid_output_names.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); origin_otensors_[i] = scope_->FindMutableTensor(valid_output_names[i]);
CHECK(origin_otensors_[i]); CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims(); origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i]; VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i];
...@@ -113,7 +118,7 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -113,7 +118,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
device_program_->Run(); device_program_->Run();
VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output XPU tensor to the buffer of origin output tensors // Copy the data of output XPU tensor to the buffer of origin output tensors
for (size_t i = 0; i < output_names_.size(); i++) { for (size_t i = 0; i < origin_otensors_.size(); i++) {
auto output_ndarray = device_program_->GetOutput(i); auto output_ndarray = device_program_->GetOutput(i);
std::memcpy(origin_otensors_[i]->mutable_data<float>(), std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data), static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data),
......
...@@ -49,6 +49,7 @@ add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) ...@@ -49,6 +49,7 @@ add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
# 2.basic ops not used in basic models # 2.basic ops not used in basic models
add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
...@@ -89,6 +90,8 @@ add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${o ...@@ -89,6 +90,8 @@ add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${o
add_operator(reduce_prod_op_lite extra SRCS reduce_prod_op.cc DEPS ${op_DEPS}) add_operator(reduce_prod_op_lite extra SRCS reduce_prod_op.cc DEPS ${op_DEPS})
add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS})
add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool_concat extra SRCS sequence_pool_concat_op.cc DEPS ${op_DEPS})
add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS})
add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS})
add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS})
...@@ -119,7 +122,6 @@ add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS}) ...@@ -119,7 +122,6 @@ add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS}) add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS}) add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS})
add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS}) add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS}) add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS})
add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS}) add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS})
add_operator(slice_op_lite basic SRCS slice_op.cc DEPS ${op_DEPS}) add_operator(slice_op_lite basic SRCS slice_op.cc DEPS ${op_DEPS})
......
...@@ -52,12 +52,12 @@ inline int ConvOutputSize(int input_size, ...@@ -52,12 +52,12 @@ inline int ConvOutputSize(int input_size,
return output_size; return output_size;
} }
inline void UpdatePaddingAndDilation(std::vector<int>* paddings, void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations, std::vector<int>* dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::string padding_algorithm, const std::string padding_algorithm,
const lite::DDim data_dims, const lite::DDim data_dims,
const lite::DDim& ksize) { const lite::DDim& ksize) {
// when padding_desc is "VALID" or "SAME" // when padding_desc is "VALID" or "SAME"
if (padding_algorithm == "SAME") { if (padding_algorithm == "SAME") {
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
......
...@@ -136,7 +136,13 @@ class ConvOpLite : public OpLite { ...@@ -136,7 +136,13 @@ class ConvOpLite : public OpLite {
mutable ConvParam param_; mutable ConvParam param_;
std::string padding_algorithm_{""}; std::string padding_algorithm_{""};
}; };
// update padding dilation
void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations,
const std::vector<int>& strides,
const std::string padding_algorithm,
const lite::DDim data_dims,
const lite::DDim& ksize);
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/grid_sampler_op.h"
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace operators {
bool GridSamplerOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.out);
CHECK_OR_FALSE(param_.grid);
auto x_dims = param_.x->dims();
auto grid_dims = param_.grid->dims();
CHECK_EQ(x_dims.size(), 4UL) << "Input must have 4 dimensions.";
CHECK_EQ(grid_dims.size(), 4UL) << "Grid must have 4 dimensions.";
CHECK_EQ(grid_dims[0], x_dims[0])
<< "Input(X) dims[0] and Input(Grid) dims[0] should be equal.";
CHECK_EQ(grid_dims[1], x_dims[2])
<< "Input(X) dims[2] and Input(Grid) dims[1] should be equal.";
CHECK_EQ(grid_dims[2], x_dims[3])
<< "Input(X) dims[3] and Input(Grid) dims[2] should be equal.";
return true;
}
bool GridSamplerOp::InferShape() const {
auto x_dims = param_.x->dims();
param_.out->Resize(x_dims);
return true;
}
bool GridSamplerOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
param_.grid =
scope->FindVar(op_desc.Input("Grid").front())->GetMutable<Tensor>();
param_.out =
scope->FindVar(op_desc.Output("Output").front())->GetMutable<Tensor>();
return true;
}
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
REGISTER_LITE_OP(grid_sampler, paddle::lite::operators::GridSamplerOp);
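An example of shapes accepted by CheckShape above; the last Grid dimension holding the (x, y) sampling coordinates is the usual grid_sample layout and is not checked explicitly here:

// X    : {4, 3, 19, 19}   // N, C, H, W
// Grid : {4, 19, 19, 2}   // dims 0/1/2 must equal N, H, W of X
// Out  : {4, 3, 19, 19}   // InferShape copies X's dims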
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class GridSamplerOp : public OpLite {
public:
GridSamplerOp() {}
explicit GridSamplerOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "grid_sampler"; }
private:
mutable GridSamplerParam param_;
};
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
...@@ -30,7 +30,7 @@ bool LayerNormOp::CheckShape() const { ...@@ -30,7 +30,7 @@ bool LayerNormOp::CheckShape() const {
bool LayerNormOp::InferShape() const { bool LayerNormOp::InferShape() const {
auto out_dims = param_.X->dims(); auto out_dims = param_.X->dims();
param_.Y->Resize(out_dims); param_.Y->Resize(out_dims);
auto inner_size = out_dims.Flatten2D(param_.begin_norm_axis)[1]; auto inner_size = out_dims.Flatten2D(param_.begin_norm_axis)[0];
param_.Mean->Resize(std::vector<int64_t>({inner_size})); param_.Mean->Resize(std::vector<int64_t>({inner_size}));
param_.Variance->Resize(std::vector<int64_t>({inner_size})); param_.Variance->Resize(std::vector<int64_t>({inner_size}));
......
...@@ -286,6 +286,8 @@ struct ConvParam { ...@@ -286,6 +286,8 @@ struct ConvParam {
std::string data_format{"Anylayout"}; std::string data_format{"Anylayout"};
// for activation // for activation
ActivationParam activation_param; ActivationParam activation_param;
// support var_length or not
bool var_length{false};
// for int8 // for int8
WITH_INT8_CONFIG WITH_INT8_CONFIG
}; };
...@@ -767,6 +769,12 @@ struct SequencePoolParam { ...@@ -767,6 +769,12 @@ struct SequencePoolParam {
#endif #endif
}; };
struct SequencePoolConcatParam {
std::vector<lite::Tensor*> X{};
lite::Tensor* Out{};
std::vector<std::string> pool_type{};
};
struct SearchGroupPaddingParam { struct SearchGroupPaddingParam {
lite::Tensor* x{}; lite::Tensor* x{};
lite::Tensor* out_emb_padding{}; lite::Tensor* out_emb_padding{};
...@@ -862,6 +870,8 @@ struct VarConv2DParam { ...@@ -862,6 +870,8 @@ struct VarConv2DParam {
int stride_w; int stride_w;
int kernel_h; int kernel_h;
int kernel_w; int kernel_w;
bool fuse_relu{false};
}; };
/// ----------------------- shape operators ---------------------- /// ----------------------- shape operators ----------------------
...@@ -1114,6 +1124,12 @@ struct InstanceNormParam { ...@@ -1114,6 +1124,12 @@ struct InstanceNormParam {
lite::Tensor* saved_variance{}; lite::Tensor* saved_variance{};
float epsilon; float epsilon;
}; };
/// --------------------- grid sampler operators --------------------
struct GridSamplerParam {
lite::Tensor* x{};
lite::Tensor* out{};
lite::Tensor* grid{};
};
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
......
...@@ -23,47 +23,10 @@ bool SequenceConcatOp::CheckShape() const { ...@@ -23,47 +23,10 @@ bool SequenceConcatOp::CheckShape() const {
CHECK_GT(param_.X.size(), 1) CHECK_GT(param_.X.size(), 1)
<< "The number of input sequences is at least two."; << "The number of input sequences is at least two.";
CHECK_OR_FALSE(param_.Out); CHECK_OR_FALSE(param_.Out);
size_t lod_size = 0;
for (const auto &t : param_.X) {
CHECK_EQ(t->lod().empty(), false)
<< "Input Tensor of X does not contain LoD information.";
// CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now.";
if (lod_size == 0) {
lod_size = t->lod()[0].size();
} else {
CHECK_EQ(t->lod()[0].size(), lod_size)
<< "The number of sequence must be same between each input";
}
}
CHECK_NE(lod_size, 0) << "Each input must have sequence information";
return true; return true;
} }
bool SequenceConcatOp::InferShape() const { bool SequenceConcatOp::InferShape() const { return true; }
int64_t batch_size = 0;
int64_t feature_size = 0;
std::vector<int64_t> out_dims;
for (const auto &tensor : param_.X) {
const auto x_dims = tensor->dims();
if (out_dims.empty()) {
out_dims = x_dims.Vectorize();
}
batch_size += x_dims[0];
if (feature_size == 0) {
feature_size = x_dims.production() / x_dims[0];
} else {
CHECK_EQ(feature_size, x_dims.production() / x_dims[0])
<< "Inputs of sequence concat must have same feature size";
}
}
if (batch_size < 0) {
batch_size = -1; // Normalize batch size for compile time.
}
out_dims[0] = batch_size;
param_.Out->Resize(out_dims);
// LoD info will be computed in Kernel.
return true;
}
bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) { lite::Scope *scope) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_pool_concat_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequencePoolConcatOp::CheckShape() const {
  CHECK_GE(param_.X.size(), 1)
      << "The number of input sequences is at least one.";
CHECK_OR_FALSE(param_.Out);
return true;
}
bool SequencePoolConcatOp::InferShape() const {
int out_dim = 0;
for (int i = 0; i < param_.X.size(); ++i) {
out_dim += param_.X[i]->dims().count(1, param_.X[i]->dims().size());
}
int seq_num = param_.X[0]->lod()[0].size() - 1;
std::vector<std::vector<uint64_t>> lod(1);
for (int i = 0; i < seq_num + 1; ++i) {
lod[0].push_back(i);
}
param_.Out->set_lod(lod);
param_.Out->Resize({seq_num, out_dim});
return true;
}
bool SequencePoolConcatOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) {
auto input_list = opdesc.Input("X");
param_.X.clear();
for (auto var : input_list) {
param_.X.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
  CHECK(param_.Out)
      << "Output(Out) of SequencePoolConcat Op should not be null.";
param_.pool_type = opdesc.GetAttr<std::vector<std::string>>("pooltype");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_pool_concat,
paddle::lite::operators::SequencePoolConcatOp);
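InferShape above emits one pooled row per sequence, concatenating the per-input feature widths. A worked example with shapes and LoD chosen for illustration:

// X[0] dims = {n, 8}, X[1] dims = {n, 16}, both with lod {0, 3, 7, n}
//   seq_num = 4 - 1 = 3,  out_dim = 8 + 16 = 24
//   Out dims = {3, 24},  Out lod = {0, 1, 2, 3}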
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequencePoolConcatOp : public OpLite {
public:
SequencePoolConcatOp() {}
explicit SequencePoolConcatOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_pool_concat"; }
private:
mutable SequencePoolConcatParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -135,6 +135,15 @@ bool Transpose2Op::InferShape() const { ...@@ -135,6 +135,15 @@ bool Transpose2Op::InferShape() const {
out_dims[i] = x_dims[axis[i]]; out_dims[i] = x_dims[axis[i]];
} }
param_.output->Resize(out_dims); param_.output->Resize(out_dims);
std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0);
for (size_t i = 0; i < x_dims.size(); i++) {
xshape_dims[i + 1] = x_dims[i];
}
param_.xshape->Resize(xshape_dims);
auto xshape_lod = param_.xshape->mutable_lod();
*xshape_lod = param_.x->lod();
return true; return true;
} }
......
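The added lines also populate XShape with the original input shape prefixed by a 0 (the convention consumed by transpose2's companion ops, stated here as background rather than taken from this diff):

// Example: x dims = {3, 4, 5}  ->  xshape dims = {0, 3, 4, 5}
// (the leading 0 is a placeholder; xshape also inherits x's LoD, as set above)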
...@@ -19,28 +19,7 @@ namespace paddle { ...@@ -19,28 +19,7 @@ namespace paddle {
namespace lite { namespace lite {
namespace operators { namespace operators {
bool VarConv2dOp::CheckShape() const { bool VarConv2dOp::CheckShape() const { return true; }
auto x_dims = param_.X->dims();
CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2.";
auto w_dims = param_.W->dims();
CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor";
CHECK_EQ(w_dims[0], param_.output_channel)
<< "W dim[0] should be equal to OutputChannel";
CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w)
<< "W dim[1] should be equal to InputChannel * KernelH * KernelW";
LoD x_lod = param_.X->lod();
CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info.";
// CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted.";
CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted.";
CHECK_EQ(x_dims[0], static_cast<int64_t>(x_lod[0].back()))
<< "The Input(X)'s lod info mismatches the actual tensor shape.";
// LoD row_lod = param_.ROW->lod();
// CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info.";
// LoD col_lod = param_.COLUMN->lod();
// CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod
// info.";
return true;
}
bool VarConv2dOp::InferShape() const { return true; } bool VarConv2dOp::InferShape() const { return true; }
...@@ -69,6 +48,10 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { ...@@ -69,6 +48,10 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.kernel_w = opdesc.GetAttr<int>("KernelW"); param_.kernel_w = opdesc.GetAttr<int>("KernelW");
param_.stride_h = opdesc.GetAttr<int>("StrideH"); param_.stride_h = opdesc.GetAttr<int>("StrideH");
param_.stride_w = opdesc.GetAttr<int>("StrideW"); param_.stride_w = opdesc.GetAttr<int>("StrideW");
if (opdesc.HasAttr("fuse_relu")) {
param_.fuse_relu = opdesc.GetAttr<bool>("fuse_relu");
}
return true; return true;
} }
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -15,6 +15,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE ...@@ -15,6 +15,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -24,6 +25,9 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE ...@@ -24,6 +25,9 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
#lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if(LITE_BUILD_EXTRA) if(LITE_BUILD_EXTRA)
lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
class GridSamplerComputeTest : public arena::TestCase {
protected:
// common attributes for this op.
std::string input_ = "x";
std::string output_ = "y";
std::string grid_ = "grid";
DDim dims_{{4, 5, 19, 19}};
public:
GridSamplerComputeTest(const Place& place,
const std::string& alias,
DDim dims)
: TestCase(place, alias), dims_(dims) {}
void RunBaseline(Scope* scope) override {
auto x = scope->FindTensor(input_);
auto grid = scope->FindTensor(grid_);
auto out = scope->NewTensor(output_);
CHECK(out);
out->Resize(dims_);
const float* x_data = x->data<float>();
const float* grid_data = grid->data<float>();
float* out_data = out->mutable_data<float>();
int num = x->dims()[0];
int channel = x->dims()[1];
int height = x->dims()[2];
int width = x->dims()[3];
int spatial_size = height * width;
auto inbound = [](int x, int y, float x_max, float y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
};
for (int n = 0; n < num; ++n) {
const float* x_n = x_data + n * channel * height * width;
float* out_n = out_data + n * channel * height * width;
const float* grid_n = grid_data + n * height * width * 2;
for (int c = 0; c < channel; ++c) {
const float* x_c = x_n + c * spatial_size;
float* out_c = out_n + c * spatial_size;
for (int s = 0; s < spatial_size; ++s) {
float x = grid_n[s * 2];
float y = grid_n[s * 2 + 1];
float xwf = (x + 1.f) * 0.5 * (width - 1);
float ynf = (y + 1.f) * 0.5 * (height - 1);
int xw = floor(xwf);
int xe = xw + 1;
int yn = floor(ynf);
int ys = yn + 1;
float dw = xwf - xw;
float de = xe - xwf;
float dn = ynf - yn;
float ds = ys - ynf;
float wn = inbound(xw,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xw]
: 0.f;
float en = inbound(xe,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xe]
: 0.f;
float ws = inbound(xw,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xw]
: 0.f;
float es = inbound(xe,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xe]
: 0.f;
out_c[s] = wn * de * ds + en * dw * ds + ws * de * dn + es * dw * dn;
}
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("grid_sampler");
op_desc->SetInput("X", {input_});
op_desc->SetInput("Grid", {grid_});
op_desc->SetOutput("Output", {output_});
}
void PrepareData() override {
std::vector<float> din(dims_.production());
fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
DDim grid_dims{{dims_[0], dims_[2], dims_[3], 2}};
std::vector<float> grid(grid_dims.production());
fill_data_rand(grid.data(), -1.f, 1.f, grid_dims.production());
SetCommonTensor(input_, dims_, din.data());
SetCommonTensor(grid_, grid_dims, grid.data());
}
};
void test_grid_sampler(Place place) {
for (auto& n : {1, 13}) {
for (auto& c : {1, 3, 8}) {
for (auto& h : {1, 3, 8, 64}) {
for (auto& w : {2, 4, 9, 63}) {
DDim dim_in({n, c, h, w});
std::unique_ptr<arena::TestCase> tester(
new GridSamplerComputeTest(place, "def", dim_in));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 6e-5);
LOG(INFO) << "run n: " << n << ", c: " << c << ", h: " << h
<< ", w: " << w;
if (!arena.TestPrecision()) {
LOG(ERROR) << "No Pass!!";
return;
}
// If you want to test the performance of this op, uncomment the following line:
// arena.TestPerformance();
}
}
}
}
}
TEST(GridSampler, precision) {
#ifdef LITE_WITH_ARM
Place place(TARGET(kARM));
test_grid_sampler(place);
#endif
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
class LayerNormComputeTest : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "layer_norm";
std::string input_ = "x";
std::string scale_ = "scale";
std::string bias_ = "bias";
std::string output_ = "y";
std::string mean_ = "mean";
std::string variance_ = "variance";
DDim dims_{{4, 5, 19, 19}};
float epsilon_ = 1e-5f;
int begin_norm_axis_ = 1;
bool has_bias_ = true;
bool has_scale_ = true;
public:
LayerNormComputeTest(const Place& place,
const std::string& alias,
DDim dims,
float epsilon,
int begin_norm_axis,
bool has_bias,
bool has_scale)
: TestCase(place, alias),
dims_(dims),
epsilon_(epsilon),
begin_norm_axis_(begin_norm_axis),
has_bias_(has_bias),
has_scale_(has_scale) {}
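// Reference implementation: the input is flattened to a
// [batch_size, feature_size] matrix at begin_norm_axis; for each row the
// mean and variance are computed and the row is normalized as
//   y = (x - mean) / sqrt(variance + epsilon) * scale + bias,
// where scale and bias are optional and broadcast over the normalized axes.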
void RunBaseline(Scope* scope) override {
auto x = scope->FindTensor(input_);
auto scale = scope->FindTensor(scale_);
auto bias = scope->FindTensor(bias_);
auto y = scope->NewTensor(output_);
auto mean = scope->NewTensor(mean_);
auto variance = scope->NewTensor(variance_);
CHECK(y);
CHECK(mean);
CHECK(variance);
y->Resize(dims_);
auto matrix_dim = dims_.Flatten2D(begin_norm_axis_);
int batch_size = matrix_dim[0];
int feature_size = matrix_dim[1];
mean->Resize(std::vector<int64_t>{batch_size});
variance->Resize(std::vector<int64_t>{batch_size});
auto* x_data = x->data<float>();
auto* scale_data = (scale == nullptr ? nullptr : scale->data<float>());
auto* bias_data = (bias == nullptr ? nullptr : bias->data<float>());
auto* out_data = y->mutable_data<float>();
auto* mean_data = mean->mutable_data<float>();
auto* variance_data = variance->mutable_data<float>();
for (int i = 0; i < batch_size; ++i) {
int start = i * feature_size;
int end = start + feature_size;
float mean_t = 0;
float variance_t = 0;
for (int j = start; j < end; ++j) {
mean_t += x_data[j];
variance_t += x_data[j] * x_data[j];
}
mean_t /= feature_size;
variance_t = variance_t / feature_size - mean_t * mean_t;
mean_data[i] = mean_t;
variance_data[i] = variance_t;
variance_t = sqrt(variance_t + epsilon_);
for (int j = start; j < end; ++j) {
out_data[j] = (x_data[j] - mean_t) / variance_t;
if (scale_data) {
out_data[j] *= scale_data[j - start];
}
if (bias_data) {
out_data[j] += bias_data[j - start];
}
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
op_desc->SetInput("Bias", {bias_});
op_desc->SetInput("Scale", {scale_});
op_desc->SetOutput("Y", {output_});
op_desc->SetOutput("Mean", {mean_});
op_desc->SetOutput("Variance", {variance_});
op_desc->SetAttr("epsilon", epsilon_);
op_desc->SetAttr("begin_norm_axis", begin_norm_axis_);
}
void PrepareData() override {
std::vector<float> din(dims_.production());
fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
std::vector<int64_t> scale_v;
for (size_t i = begin_norm_axis_; i < dims_.size(); i++) {
scale_v.push_back(dims_[i]);
}
DDim scale_dim(scale_v);
std::vector<float> scale(scale_dim.production());
fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production());
std::vector<float> bias(scale_dim.production());
fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production());
SetCommonTensor(input_, dims_, din.data());
SetCommonTensor(scale_, scale_dim, scale.data());
SetCommonTensor(bias_, scale_dim, bias.data());
}
};
TEST(LayerNorm, precision) {
LOG(INFO) << "test layer_norm op";
float abs_error = 2e-5;
Place place;
#if defined(LITE_WITH_XPU)
place = TARGET(kXPU);
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
abs_error = 6e-5;
#else
return;
#endif
std::vector<std::vector<int64_t>> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}};
for (auto dim_in : dims) {
for (auto epsilon : {1e-5f}) {
for (auto axis : {0, 1, 2, 3}) {
for (bool has_bias : {true, false}) {
for (bool has_scale : {true, false}) {
if (axis >= dim_in.size()) continue;
std::unique_ptr<arena::TestCase> tester(
new LayerNormComputeTest(place,
"def",
DDim(dim_in),
epsilon,
axis,
has_bias,
has_scale));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 4);
#endif
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"mean", "variance"});
}
}
}
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
class ReshapeComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "reshape2";
std::string input_ = "x";
std::string output_ = "out";
std::string xshape_ = "xshape";
std::vector<std::string> shape_tensor_vct_;
std::string shape_tensor_;
DDim x_dims_;
std::vector<int> shape_;
bool inplace_ = false;
public:
ReshapeComputeTester(const Place& place,
const std::string& alias,
DDim x_dims,
std::vector<int> shape,
bool is_shape_tensor_vct = false,
bool is_shape_tensor = false,
bool is_shape = true)
: TestCase(place, alias), x_dims_(x_dims) {
if (is_shape_tensor_vct) {
for (size_t i = 0; i < shape.size(); i++) {
shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i));
}
} else if (is_shape_tensor) {
shape_tensor_ = op_type_ + "/shape";
} else if (is_shape) {
shape_ = shape;
} else {
LOG(FATAL) << "must set new shape!";
}
}
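// Reference implementation: the target shape may come from the "shape"
// attribute, a single Shape tensor, or a list of ShapeTensor inputs. Within
// the target shape, 0 keeps the corresponding input dimension and a single
// -1 is inferred so that the total number of elements stays unchanged.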
void RunBaseline(Scope* scope) override {
auto* out = scope->NewTensor(output_);
CHECK(out);
auto* x = scope->FindTensor(input_);
auto x_dims = x->dims();
std::vector<int> out_shape;
if (shape_tensor_vct_.size() > 0) {
for (auto shape_tensor : shape_tensor_vct_) {
out_shape.push_back(scope->FindTensor(shape_tensor)->data<int>()[0]);
}
} else if (!shape_tensor_.empty()) {
auto shape_tensor = scope->FindTensor(shape_tensor_);
auto shape_tensor_data = shape_tensor->data<int>();
out_shape = std::vector<int>(shape_tensor_data,
shape_tensor_data + shape_tensor->numel());
} else if (!shape_.empty()) {
out_shape = shape_;
} else {
LOG(FATAL) << "must set new shape!";
}
std::vector<int64_t> final_out_shape(out_shape.size(), 1);
int unk_dim_idx = -1;
int cap = 1;
for (size_t i = 0; i < out_shape.size(); i++) {
if (out_shape[i] == -1) {
CHECK_EQ(unk_dim_idx, -1);
unk_dim_idx = i;
} else if (out_shape[i] == 0) {
CHECK_LT(i, x_dims.size());
final_out_shape[i] = x_dims[i];
} else if (out_shape[i] > 0) {
final_out_shape[i] = out_shape[i];
} else {
LOG(FATAL) << "invalid shape";
}
cap *= final_out_shape[i];
}
if (unk_dim_idx > -1) {
final_out_shape[unk_dim_idx] = x_dims.production() / cap;
}
out->Resize(final_out_shape);
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
memcpy(out_data, x_data, sizeof(float) * x_dims.production());
if (op_type_ == "reshape2") {
auto* xshape = scope->NewTensor(xshape_);
auto xshape_dims = x_dims.Vectorize();
xshape_dims.insert(xshape_dims.begin(), 0);
xshape->Resize(xshape_dims);
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
if (shape_tensor_vct_.size() > 0) {
op_desc->SetInput("ShapeTensor", shape_tensor_vct_);
} else if (!shape_tensor_.empty()) {
op_desc->SetInput("Shape", {shape_tensor_});
} else if (shape_.size() > 0) {
op_desc->SetAttr("shape", shape_);
} else {
LOG(FATAL) << "invalid shape";
}
op_desc->SetOutput("Out", {output_});
if (op_type_ == "reshape2") {
op_desc->SetOutput("XShape", {xshape_});
}
op_desc->SetAttr("inplace", inplace_);
}
void PrepareData() override {
std::vector<float> data(x_dims_.production());
for (int i = 0; i < x_dims_.production(); i++) {
data[i] = i * 1.1;
}
SetCommonTensor(input_, x_dims_, data.data());
if (shape_tensor_vct_.size() > 0) {
for (size_t i = 0; i < shape_.size(); i++) {
std::vector<int> shape_data{shape_[i]};
SetCommonTensor(shape_tensor_vct_[i],
DDim(std::vector<int64_t>{1}),
shape_data.data());
}
}
if (!shape_tensor_.empty()) {
SetCommonTensor(
shape_tensor_,
DDim(std::vector<int64_t>{static_cast<int64_t>(shape_.size())}),
shape_.data());
}
}
};
TEST(Reshape, precision) {
LOG(INFO) << "test Reshape op";
float abs_error = 2e-5;
Place place;
#ifdef LITE_WITH_XPU
place = TARGET(kXPU);
#else
return;
#endif
DDim x_dims{{2, 3, 4, 5}};
std::vector<std::vector<int>> shapes{{5, 4, 3, 2},
{2, 3, 20},
{2, 60},
{120},
{2, 3, -1},
{0, 0, 20},
{0, 0, -1}};
for (auto shape : shapes) {
std::unique_ptr<arena::TestCase> tester(
new ReshapeComputeTester(place, "def", x_dims, shape));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
int data_index(std::vector<int> pos, DDimLite dims) {
int d1 = dims[1];
int d2 = dims[2];
int d3 = dims[3];
return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
}
std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
std::vector<int> out_pos(in_pos.size());
for (int i = 0; i < axis.size(); i++) {
out_pos[axis[i]] = in_pos[i];
}
return out_pos;
}
class TransposeComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "transpose2";
std::string input_ = "x";
std::string output_ = "out";
std::string xshape_ = "xshape";
DDim x_dims_;
std::vector<int> axis_;
public:
TransposeComputeTester(const Place& place,
const std::string& alias,
DDim x_dims,
std::vector<int> axis)
: TestCase(place, alias), x_dims_(x_dims), axis_(axis) {}
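// Reference implementation: output dimension i equals input dimension
// axis[i], and the element at input position p is written to the output
// position q with q[axis[i]] = p[i], i.e. a plain permutation of a 4-D
// NCHW tensor.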
void RunBaseline(Scope* scope) override {
auto* out = scope->NewTensor(output_);
CHECK(out);
auto* x = scope->FindTensor(input_);
auto x_dims = x->dims();
std::vector<int64_t> out_shape(x_dims.size(), 0);
for (size_t i = 0; i < x_dims.size(); i++) {
out_shape[i] = x_dims[axis_[i]];
}
out->Resize(out_shape);
auto y_dims = out->dims();
int input_n = x_dims[0];
int input_c = x_dims[1];
int input_h = x_dims[2];
int input_w = x_dims[3];
auto input_data = x->data<float>();
auto output_data = out->mutable_data<float>();
for (int n = 0; n < input_n; ++n) {
for (int c = 0; c < input_c; ++c) {
for (int h = 0; h < input_h; ++h) {
for (int w = 0; w < input_w; ++w) {
std::vector<int> in_pos{n, c, h, w};
std::vector<int> out_pos = pos_trans(in_pos, axis_);
int in_index = data_index(in_pos, x_dims);
int out_index = data_index(out_pos, y_dims);
output_data[out_index] = input_data[in_index];
}
}
}
}
if (op_type_ == "transpose2") {
auto* xshape = scope->NewTensor(xshape_);
auto xshape_dims = x_dims.Vectorize();
xshape_dims.insert(xshape_dims.begin(), 0);
xshape->Resize(xshape_dims);
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
op_desc->SetOutput("Out", {output_});
if (op_type_ == "transpose2") {
op_desc->SetOutput("XShape", {xshape_});
}
op_desc->SetAttr("axis", axis_);
}
void PrepareData() override {
std::vector<float> data(x_dims_.production());
for (int i = 0; i < x_dims_.production(); i++) {
data[i] = i * 1.1;
}
SetCommonTensor(input_, x_dims_, data.data());
}
};
TEST(Transpose, precision) {
LOG(INFO) << "test Transpose op";
float abs_error = 2e-5;
Place place;
#ifdef LITE_WITH_XPU
place = TARGET(kXPU);
#else
return;
#endif
DDim x_dims{{2, 3, 4, 5}};
// [XPU]: {3, 1, 0, 2} is unsupported
std::vector<std::vector<int>> axes{
{0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
} // namespace lite
} // namespace paddle
...@@ -355,7 +355,8 @@ void test_pool_fp32(const std::vector<DDim>& input_dims, ...@@ -355,7 +355,8 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
LOG(FATAL) << "test fp32 pool: input: " << dim_in LOG(FATAL) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out << ", output: " << dim_out
<< ", kernel dim: " << ksize[0] << ", " << ksize[1] << ", kernel dim: " << ksize[0] << ", " << ksize[1]
<< ", pad: " << pads[0] << ", " << pads[1] << ", pad: " << pads[0] << ", " << pads[1] << ", "
<< pads[2] << ", " << pads[3]
<< ", stride: " << strides[0] << ", " << strides[1] << ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: " << ", global_pooling: "
<< (flag_global ? "global" : "false") << (flag_global ? "global" : "false")
...@@ -370,6 +371,7 @@ void test_pool_fp32(const std::vector<DDim>& input_dims, ...@@ -370,6 +371,7 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
LOG(INFO) << "test fp32 pool: input: " << dim_in LOG(INFO) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out << ", kernel dim: " << ksize[0] << ", output: " << dim_out << ", kernel dim: " << ksize[0]
<< ", " << ksize[1] << ", pad: " << pads[0] << ", " << pads[1] << ", " << ksize[1] << ", pad: " << pads[0] << ", " << pads[1]
<< ", " << pads[2] << ", " << pads[3]
<< ", stride: " << strides[0] << ", " << strides[1] << ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: " << (flag_global ? "global" : "false") << ", global_pooling: " << (flag_global ? "global" : "false")
<< ", pooling_type: " << pooling_type << ", pooling_type: " << pooling_type
......
lite_cc_library(debug_utils SRCS debug_utils.cc DEPS op_params model_parser) if(NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
return()
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(debug_utils SRCS debug_utils.cc DEPS op_params model_parser)
lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc
DEPS DEPS
cxx_api cxx_api
debug_utils debug_utils
...@@ -16,4 +18,3 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -16,4 +18,3 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
CL_DEPS ${opencl_kernels}) CL_DEPS ${opencl_kernels})
endif()
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
#include "lite/model_parser/pb/var_desc.h" #include "lite/model_parser/pb/var_desc.h"
#include "lite/utils/all.h" #include "lite/utils/all.h"
DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(model_dir, "", "Model dir path");
DEFINE_string(input_file, "", "Input data file path"); DEFINE_string(input_file, "", "Input data file path");
DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path");
DEFINE_bool(output_topo, true, "Dump runtime topology or not"); DEFINE_bool(output_topo, true, "Dump runtime topology or not");
...@@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { ...@@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) {
CHECK(conf); CHECK(conf);
#define CHECK_NON_EMPTY(name__) \ #define CHECK_NON_EMPTY(name__) \
CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty."
CHECK_NON_EMPTY(model_path); CHECK_NON_EMPTY(model_dir);
if (FLAGS_output_topo) { if (FLAGS_output_topo) {
CHECK_NON_EMPTY(topo_output_file); CHECK_NON_EMPTY(topo_output_file);
} }
...@@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { ...@@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) {
CHECK_NON_EMPTY(tensor_output_file); CHECK_NON_EMPTY(tensor_output_file);
} }
#undef CHECK_NON_EMPTY #undef CHECK_NON_EMPTY
conf->model_dir = FLAGS_model_path; conf->model_dir = FLAGS_model_dir;
conf->topo_output_file = FLAGS_topo_output_file; conf->topo_output_file = FLAGS_topo_output_file;
conf->tensor_output_file = FLAGS_tensor_output_file; conf->tensor_output_file = FLAGS_tensor_output_file;
conf->input_file = FLAGS_input_file; conf->input_file = FLAGS_input_file;
......
...@@ -33,28 +33,49 @@ namespace paddle_mobile { ...@@ -33,28 +33,49 @@ namespace paddle_mobile {
static const char *ANDROID_LOG_TAG = static const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__; "paddle_mobile LOG built on " __DATE__ " " __TIME__;
#ifdef PADDLE_ENABLE_COLORABLE_LOG
#define PADDLE_RED "\033[1;31;40m"
#define PADDLE_GREEN "\033[1;32;40m"
#define PADDLE_YELLOW "\033[1;33;40m"
#define PADDLE_LIGHT_RED "\033[1;35;40m"
#define PADDLE_BLUE "\033[1;34;40m"
#define PADDLE_WHITE "\033[1;37;40m"
#define PADDLE_CONON "\033[0m"
#else
#define PADDLE_RED ""
#define PADDLE_GREEN ""
#define PADDLE_YELLOW ""
#define PADDLE_LIGHT_RED ""
#define PADDLE_BLUE ""
#define PADDLE_WHITE ""
#define PADDLE_CONON ""
#endif
#define ANDROIDLOGI(...) \ #define ANDROIDLOGI(...) \
__android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_YELLOW "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGW(...) \ #define ANDROIDLOGW(...) \
__android_log_print(ANDROID_LOG_WARNING, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_LIGHT_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGD(...) \ #define ANDROIDLOGD(...) \
__android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_WHITE "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGE(...) \ #define ANDROIDLOGE(...) \
__android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr)
#define ANDROIDLOGV(...) \
__android_log_print(ANDROID_LOG_VERBOSE, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, PADDLE_GREEN "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#else #else
#define ANDROIDLOGI(...) #define ANDROIDLOGI(...)
#define ANDROIDLOGW(...) #define ANDROIDLOGW(...)
#define ANDROIDLOGD(...) #define ANDROIDLOGD(...)
#define ANDROIDLOGE(...) #define ANDROIDLOGE(...)
#define ANDROIDLOGV(...)
#endif #endif
...@@ -63,6 +84,7 @@ enum LogLevel { ...@@ -63,6 +84,7 @@ enum LogLevel {
kLOG_ERROR, kLOG_ERROR,
kLOG_WARNING, kLOG_WARNING,
kLOG_INFO, kLOG_INFO,
kLOG_VERBOSE,
kLOG_DEBUG, kLOG_DEBUG,
kLOG_DEBUG1, kLOG_DEBUG1,
kLOG_DEBUG2, kLOG_DEBUG2,
...@@ -73,9 +95,9 @@ enum LogLevel { ...@@ -73,9 +95,9 @@ enum LogLevel {
// log level // log level
static LogLevel log_level = kLOG_DEBUG4; static LogLevel log_level = kLOG_DEBUG4;
static std::vector<std::string> logs{"NO", "ERROR ", "WARNING", static std::vector<std::string> logs{"NO ", "ERROR ", "WARNING", "INFO ",
"INFO ", "DEBUG ", "DEBUG1 ", "VERBOSE", "DEBUG ", "DEBUG1 ", "DEBUG2 ",
"DEBUG2 ", "DEBUG3 ", "DEBUG4 "}; "DEBUG3 ", "DEBUG4 "};
struct ToLog; struct ToLog;
struct Print; struct Print;
...@@ -97,9 +119,27 @@ struct Print { ...@@ -97,9 +119,27 @@ struct Print {
#else #else
std::cerr << buffer_.str() << std::endl; std::cerr << buffer_.str() << std::endl;
#endif #endif
} else { } else if (level == kLOG_INFO) {
#ifdef ANDROID #ifdef ANDROID
ANDROIDLOGI(buffer_.str().c_str()); ANDROIDLOGI(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else if (level == kLOG_VERBOSE) {
#ifdef ANDROID
ANDROIDLOGV(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else if (level == kLOG_WARNING) {
#ifdef ANDROID
ANDROIDLOGW(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else {
#ifdef ANDROID
ANDROIDLOGD(buffer_.str().c_str());
#else #else
std::cout << buffer_.str() << std::endl; std::cout << buffer_.str() << std::endl;
#endif #endif
...@@ -131,6 +171,7 @@ struct ToLog { ...@@ -131,6 +171,7 @@ struct ToLog {
#define LOG(level) \ #define LOG(level) \
if (level > paddle_mobile::log_level) { \ if (level > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
paddle_mobile::ToLog( \ paddle_mobile::ToLog( \
level, static_cast<const std::stringstream &>( \ level, static_cast<const std::stringstream &>( \
...@@ -143,6 +184,7 @@ struct ToLog { ...@@ -143,6 +184,7 @@ struct ToLog {
#define DLOG \ #define DLOG \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
paddle_mobile::ToLog( \ paddle_mobile::ToLog( \
paddle_mobile::kLOG_DEBUG, \ paddle_mobile::kLOG_DEBUG, \
...@@ -156,11 +198,13 @@ struct ToLog { ...@@ -156,11 +198,13 @@ struct ToLog {
#define LOGF(level, format, ...) \ #define LOGF(level, format, ...) \
if (level > paddle_mobile::log_level) { \ if (level > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
printf(format, ##__VA_ARGS__) printf(format, ##__VA_ARGS__)
#define DLOGF(format, ...) \ #define DLOGF(format, ...) \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
printf(format, ##__VA_ARGS__) printf(format, ##__VA_ARGS__)
...@@ -170,12 +214,14 @@ struct ToLog { ...@@ -170,12 +214,14 @@ struct ToLog {
#define ANDROIDLOGW(...) #define ANDROIDLOGW(...)
#define ANDROIDLOGD(...) #define ANDROIDLOGD(...)
#define ANDROIDLOGE(...) #define ANDROIDLOGE(...)
#define ANDROIDLOGV(...)
enum LogLevel { enum LogLevel {
kNO_LOG, kNO_LOG,
kLOG_ERROR, kLOG_ERROR,
kLOG_WARNING, kLOG_WARNING,
kLOG_INFO, kLOG_INFO,
kLOG_VERBOSE,
kLOG_DEBUG, kLOG_DEBUG,
kLOG_DEBUG1, kLOG_DEBUG1,
kLOG_DEBUG2, kLOG_DEBUG2,
...@@ -193,7 +239,7 @@ struct Print { ...@@ -193,7 +239,7 @@ struct Print {
}; };
struct ToLog { struct ToLog {
ToLog(LogLevel level) {} explicit ToLog(LogLevel level) {}
template <typename T> template <typename T>
ToLog &operator<<(T const &value) { ToLog &operator<<(T const &value) {
...@@ -201,14 +247,16 @@ struct ToLog { ...@@ -201,14 +247,16 @@ struct ToLog {
} }
}; };
#define LOG(level) \ #define LOG(level) \
if (true) { \ if (true) { \
} else \ /* NOLINTNEXTLINE */ \
} else \
paddle_mobile::ToLog(level) paddle_mobile::ToLog(level)
#define DLOG \ #define DLOG \
if (true) { \ if (true) { \
} else \ /* NOLINTNEXTLINE */ \
} else \
paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG) paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG)
#define LOGF(level, format, ...) #define LOGF(level, format, ...)
......
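For context, a minimal sketch of how the leveled logging macros above are typically invoked (the messages are illustrative, not taken from the sources):
LOG(paddle_mobile::kLOG_VERBOSE) << "verbose message, gated by log_level";
DLOG << "debug-only message";
LOGF(paddle_mobile::kLOG_INFO, "iteration %d finished", 10);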
...@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE = ...@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
"fill_constant_batch_size_like"; "fill_constant_batch_size_like";
const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu"; const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle"; const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle";
const char *G_OP_TYPE_EXPAND = "expand";
const char *G_OP_TYPE_GRID_SAMPLER = "grid_sampler";
std::unordered_map< std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
...@@ -156,7 +158,7 @@ std::unordered_map< ...@@ -156,7 +158,7 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Out"}}}, {G_OP_TYPE_INSTANCENORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
...@@ -258,5 +260,7 @@ std::unordered_map< ...@@ -258,5 +260,7 @@ std::unordered_map<
{{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}}, {{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}},
{G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}, {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}}; {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}},
{G_OP_TYPE_EXPAND, {{"X"}, {"Out"}}},
{G_OP_TYPE_GRID_SAMPLER, {{"X", "Grid"}, {"Output"}}}};
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; ...@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU; extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU; extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
extern const char *G_OP_TYPE_PIXEL_SHUFFLE; extern const char *G_OP_TYPE_PIXEL_SHUFFLE;
extern const char *G_OP_TYPE_EXPAND;
extern const char *G_OP_TYPE_GRID_SAMPLER;
extern std::unordered_map< extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/V2/bias_scale.h" #include "fpga/V2/bias_scale.h"
#include <memory.h>
#include <math.h> #include <math.h>
#include <memory.h>
#include "fpga/common/fpga_common.h" #include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned; *data_in = ptr_aligned;
} }
void fixed_scale_bias_new(void*data_in, int data_len) { void fixed_scale_bias_new(void *data_in, int data_len) {
int* data_tmp = static_cast<int*>(data_in); int *data_tmp = static_cast<int *>(data_in);
for (int idx = 0; idx < data_len/2; ++idx) { for (int idx = 0; idx < data_len / 2; ++idx) {
float tmp = (static_cast<float*>(data_in))[idx]; float tmp = (static_cast<float *>(data_in))[idx];
data_tmp[idx] = static_cast<int>(round(tmp*pow(2.0, 23.0))); data_tmp[idx] = static_cast<int>(round(tmp * pow(2.0, 23.0)));
tmp = (static_cast<float*>(data_in))[idx+data_len/2]; tmp = (static_cast<float *>(data_in))[idx + data_len / 2];
data_tmp[idx+data_len/2] = static_cast<int>(round(tmp*pow(2.0, 30.0))); data_tmp[idx + data_len / 2] =
} static_cast<int>(round(tmp * pow(2.0, 30.0)));
return; }
return;
} }
void interleave(float **data_in, int num_after_alignment) { void interleave(float **data_in, int num_after_alignment) {
......
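As a note on the fixed_scale_bias_new change above: the float buffer is converted in place to the fixed-point layout expected by the FPGA, i.e. the first data_len/2 entries (scales) become round(v * 2^23) and the remaining entries (biases) become round(v * 2^30), both stored as 32-bit integers.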
...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
align_each_in_area_cw = align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy( memcpy((int8_t *)image_out + tmp_channel + // NOLINT
(int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ,
k * align_each_out_area_cw_differ, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t));
channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i]; tmp_channel += channel_num[i];
} }
......
...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs, // reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
// new // new
...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
args.driver.filter_pad_width_mul_channel, args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1); REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 10) | (args.driver.filter_row << 10) |
(args.driver.filter_height << 5) | args.driver.filter_width, (args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2); REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) | (args.driver.prog_full_cnt << 16) |
...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t cmd = 0; uint64_t cmd = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image_physical_address = vaddr_to_paddr(args.image.address); image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t)( uint64_t output_height = (uint64_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + 1); args.kernel.stride_h +
1);
uint64_t output_width = (uint64_t)( uint64_t output_width = (uint64_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + 1); args.kernel.stride_w +
1);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = (uint64_t)args.image.width * uint64_t image_one_pad_per_row =
(uint64_t)args.image.channels +(uint64_t)args.image.pad_width * (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * uint64_t result_amount_align_32 =
(uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
(uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_32 * (uint64_t)args.kernel.width;
C_align_32 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_32 * C_align_32 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t mult_factor = 0; uint64_t mult_factor = 0;
float average_reciprocal = args.kernel_reciprocal; float average_reciprocal = args.kernel_reciprocal;
uint32_t* kernel_reciprocal; uint32_t *kernel_reciprocal;
kernel_reciprocal =(reinterpret_cast<uint32_t*>(&average_reciprocal)); kernel_reciprocal = (reinterpret_cast<uint32_t *>(&average_reciprocal));
if (args.mode == 1) if (args.mode == 1)
mult_factor = (uint64_t)(*kernel_reciprocal) | mult_factor = (uint64_t)(*kernel_reciprocal) | ((uint64_t)1 << 32) |
((uint64_t)1 << 32) | ((uint64_t)1 << 40); ((uint64_t)1 << 40);
else else
mult_factor = mult_factor =
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
int ret = 0; int ret = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
return ret; return ret;
} }
uint64_t image0_physical_address = 0; uint64_t image0_physical_address = 0;
uint64_t image1_physical_address = 0; uint64_t image1_physical_address = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image0_physical_address = vaddr_to_paddr(args.image0.address); image0_physical_address = vaddr_to_paddr(args.image0.address);
image1_physical_address = vaddr_to_paddr(args.image1.address); image1_physical_address = vaddr_to_paddr(args.image1.address);
image_physical_address = image_physical_address =
image0_physical_address | (image1_physical_address << 32); image0_physical_address | (image1_physical_address << 32);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
(uint64_t)args.image0.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t result_addr_row = uint64_t result_addr_row =
output_physical_address | (image_amount_per_row << 32); output_physical_address | (image_amount_per_row << 32);
uint64_t kernel_padding_step = 0; uint64_t kernel_padding_step = 0;
kernel_padding_step = ((uint64_t)args.image0.height * 2) | kernel_padding_step = ((uint64_t)args.image0.height * 2) |
((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); ((uint64_t)2 << 24) | ((uint64_t)2 << 40) |
uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | ((uint64_t)1 << 48);
((image_amount_per_row / 32 - 1) << 16) | uint64_t result_size_calcu_height =
(((uint64_t)args.image0.height * 2) << 32); ((uint64_t)args.image0.height - 1) |
uint64_t image_row_col_padding_down = image_amount_per_row | ((image_amount_per_row / 32 - 1) << 16) |
(image_amount_per_row << 32); (((uint64_t)args.image0.height * 2) << 32);
float quantParam = uint64_t image_row_col_padding_down =
((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]); image_amount_per_row | (image_amount_per_row << 32);
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam); float quantParam =
uint64_t ew_scale_mult_factor = (*ew_scale) | ((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]);
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); uint32_t *ew_scale = reinterpret_cast<uint32_t *>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) | ((uint64_t)args.const0 << 32) |
((uint64_t)args.const1 << 40);
reg_writeq(0ul, REG_SCALE_PARAMETER); reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
reg_writeq(result_size_calcu_height, 0x820); reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(32, 0x828); reg_writeq(32, 0x828);
reg_writeq(image_row_col_padding_down, 0x830); reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq(((image_amount_per_row*2) << 32), 0x838); reg_writeq(((image_amount_per_row * 2) << 32), 0x838);
reg_writeq(ew_scale_mult_factor, 0x840); // dw: do not care reg_writeq(ew_scale_mult_factor, 0x840); // dw: do not care
reg_writeq(((uint64_t)32 << 32), 0x848); reg_writeq(((uint64_t)32 << 32), 0x848);
reg_writeq(0, 0x858); reg_writeq(0, 0x858);
...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
<< " pad_height:" << args.image.pad_height << " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width; << " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address; DLOG << " filter_address:" << args.filter_address;
//<< " bias_address:" << args.bias_address; //<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width << " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h << " stride_h:" << args.kernel.stride_h
...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) {
bias_physical_address = vaddr_to_paddr(args.bias_address); bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t) uint64_t output_height = (uint64_t)(
((args.image.height + args.image.pad_height * 2 - (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.height) / args.kernel.stride_h +1); args.kernel.stride_h +
uint64_t output_width = (uint64_t) 1);
(((args.image.width + args.image.pad_width * 2 - args.kernel.width) / uint64_t output_width = (uint64_t)(
args.kernel.stride_w + 1) * args.sub_conv_num); ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1) *
args.sub_conv_num);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
(uint64_t)args.image.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = uint64_t image_one_pad_per_row =
(uint64_t)args.image.width * (uint64_t)args.image.channels + (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x( uint64_t result_amount_align_32 =
(uint64_t)output_width * (uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_64 * (uint64_t)args.kernel.width;
C_align_64 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_64 * C_align_64 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_amount_align = uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num; sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | uint64_t filter_param = filter_row_align | (filter_amount_align << 16) |
(sub_filter_amount_align << 32) | (sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num -1) << 48); (((uint64_t)args.sub_conv_num - 1) << 48);
uint64_t channel_parameter = uint64_t channel_parameter =
(uint64_t)args.image.channels | (C_align_64 << 16); (uint64_t)args.image.channels | (C_align_64 << 16);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) {
reg_writeq(channelXpad_w_channelXstep_w, 0x848); reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq(filter_physical_address, 0x850); reg_writeq(filter_physical_address, 0x850);
reg_writeq(filter_param, 0x858); reg_writeq(filter_param, 0x858);
reg_writeq(((bias_physical_address+C_align_64*4) | reg_writeq(((bias_physical_address + C_align_64 * 4) |
(bias_physical_address << 32)), 0x860); (bias_physical_address << 32)),
0x860);
cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8);
reg_writeq(cmd, 0x800); reg_writeq(cmd, 0x800);
......
...@@ -554,8 +554,8 @@ PMStatus Executor<Device, T>::Predict() { ...@@ -554,8 +554,8 @@ PMStatus Executor<Device, T>::Predict() {
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
DLOG << i << "th, " LOG(paddle_mobile::kLOG_INFO) << i << "th, "
<< "run op: " << op_handler->Type(); << "run op: " << op_handler->Type();
if (lod_mode_ && input_dim_has_changed_) { if (lod_mode_ && input_dim_has_changed_) {
op_handler->InferShape(); op_handler->InferShape();
} }
......
...@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA); ...@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn); LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif #endif
#ifdef ELEMENTWISESUB_OP #ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU) LOAD_OP2(elementwise_sub, CPU, GPU_CL)
#endif #endif
#ifdef TOP_K_OP #ifdef TOP_K_OP
LOAD_OP1(top_k, CPU) LOAD_OP1(top_k, CPU)
...@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU); ...@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU);
#ifdef PIXEL_SHUFFLE_OP #ifdef PIXEL_SHUFFLE_OP
LOAD_OP1(pixel_shuffle, GPU_CL); LOAD_OP1(pixel_shuffle, GPU_CL);
#endif #endif
#ifdef EXPAND_OP
LOAD_OP1(expand, GPU_CL);
#endif
#ifdef GRID_SAMPLER_OP
LOAD_OP1(grid_sampler, GPU_CL);
#endif
...@@ -262,6 +262,37 @@ void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) { ...@@ -262,6 +262,37 @@ void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
paddle_mobile_->Predict_From_To(start, end); paddle_mobile_->Predict_From_To(start, end);
} }
#else
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Feed(const std::string &var_name,
const PaddleTensor &input) {
framework::DDim ddim = framework::make_ddim(input.shape);
framework::Tensor input_tensor(static_cast<T *>(input.data.data()), ddim);
paddle_mobile_->Feed(var_name, input_tensor);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Fetch(const std::string &var_name,
PaddleTensor *output) {
auto output_tensor = paddle_mobile_->Fetch(var_name);
auto ddim = output_tensor->dims();
output->shape.clear();
for (int i = 0; i < ddim.size(); i++) {
output->shape.push_back(static_cast<int>(ddim[i]));
}
int length = output_tensor->numel() * sizeof(T);
if (output->data.length() < length) {
output->data.Resize(length);
}
memcpy(output->data.data(), output_tensor->template data<T>(), length);
}
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Run() {
paddle_mobile_->Predict();
return true;
}
#endif #endif
template <typename Device, typename T> template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() { PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
......
...@@ -39,7 +39,10 @@ class PaddleMobilePredictor : public PaddlePredictor { ...@@ -39,7 +39,10 @@ class PaddleMobilePredictor : public PaddlePredictor {
void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override; void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override;
void FetchPaddleTensors(PaddleTensor* outputs, int id) override; void FetchPaddleTensors(PaddleTensor* outputs, int id) override;
void GetPaddleTensor(const std::string& name, PaddleTensor* output) override; void GetPaddleTensor(const std::string& name, PaddleTensor* output) override;
#else
void Feed(const std::string& var_name, const PaddleTensor& input);
void Fetch(const std::string& var_name, PaddleTensor* output);
bool Run();
#endif #endif
~PaddleMobilePredictor() override; ~PaddleMobilePredictor() override;
......
...@@ -191,6 +191,10 @@ class PaddlePredictor { ...@@ -191,6 +191,10 @@ class PaddlePredictor {
virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0; virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0;
virtual void GetPaddleTensor(const std::string& name, virtual void GetPaddleTensor(const std::string& name,
PaddleTensor* output) = 0; PaddleTensor* output) = 0;
#else
virtual void Feed(const std::string& var_name, const PaddleTensor& input) = 0;
virtual void Fetch(const std::string& var_name, PaddleTensor* output) = 0;
virtual bool Run() = 0;
#endif #endif
protected: protected:
......
...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators; ...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp); REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif #endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#include "operators/expand_op.h"
#include <framework/ddim.h>
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ExpandOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
int expand_size = this->param_.expand_times.size();
int x_dims_size = x_dim.size();
PADDLE_MOBILE_ENFORCE(expand_size == x_dims_size,
"The number of expand_times size must be qual to the "
"rank of Input(X). The number of expand_times size "
"must be qual to the rank of Input(X).")
framework::DDim out_dims(this->param_.InputX()->dims());
for (size_t i = 0; i < this->param_.expand_times.size(); ++i) {
out_dims[i] *= this->param_.expand_times[i];
}
this->param_.Out()->Resize(out_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(expand, ops::ExpandOp);
#endif
#endif
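For illustration, the InferShape rule above multiplies each input dim by the matching expand_times entry, so a hypothetical input of shape {1, 4, 32, 32} with expand_times = {1, 1, 2, 2} resizes Out to {1, 4, 64, 64}. A minimal stand-alone sketch of the same rule, with framework::DDim replaced by std::vector<int64_t>:

// Stand-alone sketch of the expand InferShape rule above (illustrative only).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ExpandInferShape(const std::vector<int64_t> &x_dims,
                                      const std::vector<int> &expand_times) {
  assert(expand_times.size() == x_dims.size());  // same check as the ENFORCE above
  std::vector<int64_t> out_dims(x_dims);
  for (size_t i = 0; i < expand_times.size(); ++i) {
    out_dims[i] *= expand_times[i];  // tile each dim by its expand factor
  }
  return out_dims;  // e.g. {1, 4, 32, 32} x {1, 1, 2, 2} -> {1, 4, 64, 64}
}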
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/expand_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef EXPAND_OP
DECLARE_OPERATOR(Expand, ExpandParam, ExpandKernel);
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher { ...@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FusionInstanceNormReluOp class FusionInstanceNormReluOp
: public framework::OperatorWithKernel< : public framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>, DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>> { operators::InstanceNormReluKernel<DeviceType, T>> {
public: public:
FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs, FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs,
...@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp ...@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp
const framework::AttributeMap &attrs, const framework::AttributeMap &attrs,
framework::Scope *scope) framework::Scope *scope)
: framework::OperatorWithKernel< : framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>, DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>>( operators::InstanceNormReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {} type, inputs, outputs, attrs, scope) {}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#include "operators/grid_sampler_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void GridSamplerOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Output()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(grid_sampler, ops::GridSamplerOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/grid_sampler_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef GRID_SAMPLER_OP
DECLARE_OPERATOR(GridSampler, GridSamplerParam, GridSamplerKernel);
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -24,7 +24,7 @@ namespace operators { ...@@ -24,7 +24,7 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void InstanceNormOp<Dtype, T>::InferShape() const { void InstanceNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims); this->param_.OutputY()->Resize(x_dims);
} }
} // namespace operators } // namespace operators
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/arm/convolution/conv_common.h"
#include "framework/context.h" #include "framework/context.h"
#include "operators/math/gemm/gemm1x1s1.h" #include "operators/math/gemm/gemm1x1s1.h"
...@@ -111,3 +113,4 @@ void InitBaseConvKernel(ConvParam<CPU> *param) { ...@@ -111,3 +113,4 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/central-arm-func/conv_arm_func.h" #include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <vector> #include <vector>
...@@ -375,3 +376,4 @@ template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param); ...@@ -375,3 +376,4 @@ template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/cl/cl-kernel-func/conv_func.h" #include "operators/kernel/cl/cl-kernel-func/conv_func.h"
#include <vector> #include <vector>
...@@ -1123,3 +1124,4 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, ...@@ -1123,3 +1124,4 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper,
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,23 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,23 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef INSTANCENORM_OP
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h" #include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
#include <algorithm> #include <algorithm>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper, void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param) { const framework::CLImage *input, framework::CLImage *output,
float epsilon) {
auto kernel = cl_helper->KernelAt(0); auto kernel = cl_helper->KernelAt(0);
auto &dims = param.Out()->dims(); auto &dims = output->dims();
const int n = dims[0]; const int n = dims[0];
const int c_group = (dims[1] + 3) / 4; const int c_group = (dims[1] + 3) / 4;
const int h = dims[2]; const int h = dims[2];
const int w = dims[3]; const int w = dims[3];
auto epsilon = param.Epsilon(); auto input_image = input->GetCLImage();
auto input = param.InputX()->GetCLImage(); auto out_image = output->GetCLImage();
auto out = param.Out()->GetCLImage();
// DLOG << "Epsilon: " << epsilon; // DLOG << "Epsilon: " << epsilon;
...@@ -66,12 +66,13 @@ void InstanceNorm(framework::CLHelper *cl_helper, ...@@ -66,12 +66,13 @@ void InstanceNorm(framework::CLHelper *cl_helper,
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon); clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input); clSetKernelArg(kernel, 6, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out); clSetKernelArg(kernel, 7, sizeof(cl_mem), &out_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL, clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL); work_size, local_work_size, 0, NULL, NULL);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -21,7 +21,8 @@ limitations under the License. */ ...@@ -21,7 +21,8 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper, void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param); const framework::CLImage *input, framework::CLImage *output,
float epsilon);
} }
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_sub(__global image2d_t inputImage, __global image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 input = read_imageh(inputImage, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = input - biase;
write_imageh(outputImage, coords, output);
}
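As a plain C++ reference for what the kernel above computes, each output texel is the element-wise difference of the corresponding input and bias texels; a minimal sketch over flat float buffers (image sampling and half precision are omitted):

// CPU reference for the elementwise_sub kernel above: out = input - bias,
// applied element-wise over buffers of equal length (a sketch, not the CL path).
#include <cstddef>

void ElementwiseSubReference(const float *input, const float *bias,
                             float *output, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    output[i] = input[i] - bias[i];
  }
}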
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void expend_c1(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
in.y = in.x;
in.z = in.x;
in.w = in.x;
write_imageh(output, output_pos, in);
}
__kernel void expend_c2(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
in.z = in.x;
in.w = in.y;
write_imageh(output, output_pos, in);
}
__kernel void expend_c4(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
write_imageh(output, output_pos, in);
}
\ No newline at end of file
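The three expend_* variants above differ only in how many valid channels the half4 texel carries: expend_c1 broadcasts lane x into y/z/w, expend_c2 copies the x/y pair into z/w, and expend_c4 passes all four lanes through. A small C++ sketch of that lane replication, with Vec4 standing in for the OpenCL half4:

// Sketch of the channel-lane replication performed by expend_c1/_c2/_c4 above.
// Vec4 stands in for the OpenCL half4; only the lane shuffling is modeled.
struct Vec4 { float x, y, z, w; };

Vec4 ReplicateLanes(Vec4 in, int valid_channels) {
  if (valid_channels == 1) {         // expend_c1: broadcast lane x
    in.y = in.x; in.z = in.x; in.w = in.x;
  } else if (valid_channels == 2) {  // expend_c2: duplicate the x/y pair
    in.z = in.x; in.w = in.y;
  }                                  // expend_c4: all four lanes already valid
  return in;
}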
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_common.h"
__kernel void grid_sampler(__private const int out_height,
__private const int out_width,
__read_only image2d_t input,
__read_only image2d_t grid,
__write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2) * 4;
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int x_grid = out_h / 4 * 2;
int y_grid = out_n * out_width + out_w;
float4 g1 = read_imagef(grid, sampler, (int2)(x_grid, y_grid));
float4 g2 = read_imagef(grid, sampler, (int2)(x_grid + 1, y_grid));
float x = (g1.x + 1) * (out_width - 1) / 2;
float y = (g2.x + 1) * (out_height - 1) / 2;
float x0 = floor(x);
float y0 = floor(y);
int x_p = out_c * out_width + x0;
int y_p = out_n * out_height + y0;
int x_out = out_c * out_width + out_w;
int y_out = out_n * out_height + out_h;
float4 input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
float4 input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
float4 input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
float4 input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
float4 out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out), convert_half4(out_val));
x = (g1.y + 1) * (out_width - 1) / 2;
y = (g2.y + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 1), convert_half4(out_val));
x = (g1.z + 1) * (out_width - 1) / 2;
y = (g2.z + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 2), convert_half4(out_val));
x = (g1.w + 1) * (out_width - 1) / 2;
y = (g2.w + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 3), convert_half4(out_val));
}
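For reference, each of the four samples in the kernel above is standard bilinear interpolation: the grid value g in [-1, 1] is mapped to a pixel coordinate and the four neighbouring texels are blended by their fractional distances. A minimal C++ sketch of that computation on a single-channel float image (the clamping done by the CL sampler is made explicit here):

// Bilinear sampling sketch mirroring the arithmetic of the grid_sampler kernel
// above, for a single-channel H x W float image (illustrative only).
#include <algorithm>
#include <cmath>

float BilinearSample(const float *img, int H, int W, float gx, float gy) {
  // Map grid coords from [-1, 1] to pixel space, as in the kernel.
  float x = (gx + 1.f) * (W - 1) / 2.f;
  float y = (gy + 1.f) * (H - 1) / 2.f;
  float x0 = std::floor(x), y0 = std::floor(y);
  auto at = [&](float xi, float yi) {
    int cx = std::min(std::max(static_cast<int>(xi), 0), W - 1);
    int cy = std::min(std::max(static_cast<int>(yi), 0), H - 1);
    return img[cy * W + cx];
  };
  return at(x0, y0) * (x0 + 1 - x) * (y0 + 1 - y) +
         at(x0 + 1, y0) * (x - x0) * (y0 + 1 - y) +
         at(x0, y0 + 1) * (x0 + 1 - x) * (y - y0) +
         at(x0 + 1, y0 + 1) * (x - x0) * (y - y0);
}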
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/kernel/elementwise_sub_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseSubKernel<GPU_CL, float>::Init(
ElementwiseSubParam<GPU_CL> *param) {
framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
const_cast<framework::CLImage *>(param->InputY()));
if (bias->dims().size() == 4) {
if (!bias->isInit()) {
bias->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
this->cl_helper_.AddKernel("elementwise_sub", "elementwise_sub_kernel.cl");
} else {
DLOG << "error:bias dims not support";
}
return true;
}
template <>
void ElementwiseSubKernel<GPU_CL, float>::Compute(
const ElementwiseSubParam<GPU_CL> &param) {
auto input = param.InputX();
auto bias = param.InputY();
auto output = param.Out();
cl_int status;
auto kernel = this->cl_helper_.KernelAt(0);
if (bias->dims().size() == 4) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
DLOG << "error:bias dims not support";
}
}
template class ElementwiseSubKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#include "operators/kernel/expand_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ExpandKernel<GPU_CL, float>::Init(ExpandParam<GPU_CL>* param) {
const framework::DDim& input_dims = param->InputX()->dims();
PADDLE_MOBILE_ENFORCE(input_dims.size() == 4,
"expend now support 4 size dims");
if (input_dims[1] == 1) {
this->cl_helper_.AddKernel("expend_c1", "expend.cl");
} else if (input_dims[1] == 2) {
this->cl_helper_.AddKernel("expend_c2", "expend.cl");
} else if (input_dims[1] == 4) {
this->cl_helper_.AddKernel("expend_c4", "expend.cl");
} else {
PADDLE_MOBILE_ENFORCE(false, "expend did not supported this type");
}
return true;
}
template <>
void ExpandKernel<GPU_CL, float>::Compute(const ExpandParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
DLOG << "param.Out()->dims(): " << param.Out()->dims();
const framework::DDim& image_dims = param.Out()->ImageDims();
DLOG << "param.Out()->image_dims(): " << image_dims;
auto out_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
DLOG << "out_work_size: " << out_work_size;
int out_c_block = out_work_size[0];
int out_w = out_work_size[1];
int out_nh = out_work_size[2];
auto in_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
int in_c_block = in_work_size[0];
int in_w = in_work_size[1];
int in_nh = in_work_size[2];
int input_width = param.InputX()->dims()[3];
int input_height = param.InputX()->dims()[2];
int output_width = param.Out()->dims()[3];
int output_height = param.Out()->dims()[2];
const auto* input = param.InputX();
auto* output = param.Out();
vector<int> expandTimes = {1, 1, 1, 1};
DLOG << "param.expand_times: " << param.expand_times;
for (int i = 0; i < param.expand_times.size(); ++i) {
expandTimes[i] = param.expand_times[i];
}
DLOG << "expandTimes: " << expandTimes;
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
input->dims();
int idx = 0;
cl_int status;
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[0]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[1]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[2]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[3]);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
out_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
DLOG << *output;
}
template class ExpandKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#include "operators/kernel/grid_sampler_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool GridSamplerKernel<GPU_CL, float>::Init(GridSamplerParam<GPU_CL>* param) {
this->cl_helper_.AddKernel("grid_sampler", "grid_sampler_kernel.cl");
return true;
}
template <>
void GridSamplerKernel<GPU_CL, float>::Compute(
const GridSamplerParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Output()));
cl_int status;
auto output = param.Output();
auto input = param.InputX();
auto grid = param.Grid();
auto output_image = output->GetCLImage();
auto input_image = input->GetCLImage();
auto grid_image = grid->GetCLImage();
const int out_H = output->dims()[2];
const int out_W = output->dims()[3];
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &grid_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {default_work_size[0], default_work_size[1],
default_work_size[2] / 4};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class GridSamplerKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -23,7 +23,7 @@ namespace operators { ...@@ -23,7 +23,7 @@ namespace operators {
template <> template <>
bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) { bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
auto &dims = param->Out()->dims(); auto &dims = param->OutputY()->dims();
const int h = dims[2]; const int h = dims[2];
std::string build_options = ""; std::string build_options = "";
if (h == 128) { if (h == 128) {
...@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) { ...@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
template <> template <>
void InstanceNormKernel<GPU_CL, float>::Compute( void InstanceNormKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const InstanceNormParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param); InstanceNorm(&this->cl_helper_, param.InputX(), param.OutputY(),
param.Epsilon());
} }
template class InstanceNormKernel<GPU_CL, float>; template class InstanceNormKernel<GPU_CL, float>;
......
...@@ -23,7 +23,7 @@ namespace operators { ...@@ -23,7 +23,7 @@ namespace operators {
template <> template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init( bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) { FusionInstanceNormReluParam<GPU_CL> *param) {
auto &dims = param->Out()->dims(); auto &dims = param->Out()->dims();
const int h = dims[2]; const int h = dims[2];
std::string build_options = "-DRELU"; std::string build_options = "-DRELU";
...@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init( ...@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
template <> template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute( void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const FusionInstanceNormReluParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param); InstanceNorm(&this->cl_helper_, param.InputX(), param.Out(), param.Epsilon());
} }
template class InstanceNormReluKernel<GPU_CL, float>; template class InstanceNormReluKernel<GPU_CL, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef EXPAND_OP
DECLARE_KERNEL(Expand, ExpandParam);
#endif // EXPAND_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#include <math.h>
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { ...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include <math.h>
#include "operators/kernel/elementwise_add_relu_kernel.h" #include "operators/kernel/elementwise_add_relu_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { ...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
keep_nms.Resize({post_nms_top_n}); keep_nms.Resize({post_nms_top_n});
} }
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original
CPUGather<T>(bbox_sel, keep_nms, &proposals); CPUGather<T>(bbox_sel, keep_nms, &proposals);
...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < score_height; h++) { for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) { for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; ++c) { for (int c = 0; c < score_channels; ++c) {
int dstidx = h*unalignedCW + w*score_channels + c; int dstidx = h * unalignedCW + w * score_channels + c;
int srcidx = h*alignedCW + w*score_channels + c; int srcidx = h * alignedCW + w * score_channels + c;
score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx]; score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx];
} }
} }
...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < bbox_height; h++) { for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) { for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; ++c) { for (int c = 0; c < bbox_channels; ++c) {
int dstidx = h*unalignedCW + w*bbox_channels + c; int dstidx = h * unalignedCW + w * bbox_channels + c;
int srcidx = h*alignedCW + w*bbox_channels + c; int srcidx = h * alignedCW + w * bbox_channels + c;
bbox_tensor->data<float>()[dstidx] = bbox_tensor->data<float>()[dstidx] =
(static_cast<int>(input_bbox_data[srcidx]))/127.0* (static_cast<int>(input_bbox_data[srcidx])) / 127.0 *
input_bbox->scale[0]; input_bbox->scale[0];
} }
} }
} }
...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
float min_size = param.min_size_; float min_size = param.min_size_;
float eta = param.eta_; float eta = param.eta_;
rpn_rois->mutable_data<float>({bbox_tensor->numel()/4, 4}); rpn_rois->mutable_data<float>({bbox_tensor->numel() / 4, 4});
rpn_roi_probs->mutable_data<int8_t>({input_score->numel()/4, 1}); rpn_roi_probs->mutable_data<int8_t>({input_score->numel() / 4, 1});
framework::LoD lod; framework::LoD lod;
lod.resize(1); lod.resize(1);
auto &lod0 = lod[0]; auto &lod0 = lod[0];
lod0.push_back(0); lod0.push_back(0);
anchors.Resize({anchors.numel()/4, 4}); anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel()/4, 4}); variances.Resize({variances.numel() / 4, 4});
int64_t num_proposals = 0; int64_t num_proposals = 0;
for (int64_t i = 0; i < score_n; ++i) { for (int64_t i = 0; i < score_n; ++i) {
......
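The bbox conversion above dequantizes the int8 FPGA output back to float as value / 127.0 * scale; for example, a stored int8 of 64 with scale 2.0 becomes roughly 1.008. A one-function sketch of that rule:

// Dequantization rule used for the bbox tensor above: int8 -> float.
#include <cstdint>

inline float DequantizeInt8(int8_t v, float scale) {
  return static_cast<int>(v) / 127.0f * scale;  // e.g. v = 64, scale = 2.0 -> ~1.008
}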
...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"the channels of input X should equal the product of " "the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"); "output_channels x pooled_height x pooled_width");
auto output_data = out->mutable_data<float>(); auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>(); auto input_rois = rois->data<float>();
...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) { for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) { for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>( PSROIPoolingForward<float>(input_data, height, width, input_channels,
input_data, height, width, input_channels, offset_output_data, offset_output_data, pooled_height,
pooled_height, pooled_width, output_channels, input_rois, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph, bin_size_h, bin_size_w, roi_start_h,
scale, roi_batch_ind); roi_start_w, pw, ph, scale, roi_batch_ind);
} }
} }
} }
......
...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { ...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto inputdimsize = input->dims().size(); auto inputdimsize = input->dims().size();
auto outputdimsize = output->dims().size(); auto outputdimsize = output->dims().size();
int smallersize = int smallersize =
inputdimsize > outputdimsize ? outputdimsize : inputdimsize; inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
int i = 0; int i = 0;
for (i = 0; i < smallersize; i++) { for (i = 0; i < smallersize; i++) {
if ((input->dims())[i] != (output->dims())[i]) if ((input->dims())[i] != (output->dims())[i]) break;
break;
} }
if (i == smallersize) { if (i == smallersize) {
reshapeNeedFlg = 0; reshapeNeedFlg = 0;
......
...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { ...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
int len = end - start; int len = end - start;
size_t size = len * sizeof(int8_t); size_t size = len * sizeof(int8_t);
DLOG << input->fpga_data_num; DLOG << input->fpga_data_num;
fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(input_ptr, input->fpga_data_num * sizeof(int8_t));
DLOG << output->fpga_data_num; DLOG << output->fpga_data_num;
fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(output_ptr, output->fpga_data_num * sizeof(int8_t));
int unalignedWC = len * W; int unalignedWC = len * W;
int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
if (unalignedWC != alignedWC) { if (unalignedWC != alignedWC) {
auto tmpOutput = reinterpret_cast<int8_t*> auto tmpOutput =
(fpga::fpga_malloc(len*HW * sizeof(int8_t))); reinterpret_cast<int8_t*>(fpga::fpga_malloc(len * HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) = *(tmpOutput + unalignedWC * i + j);
} }
for (int i = 0; i < H; i++) { }
for (int j = 0; j < unalignedWC; j++) { fpga::fpga_free(tmpOutput);
*(output_ptr + alignedWC * i + j) =
*(tmpOutput + unalignedWC * i + j);
}
}
fpga::fpga_free(tmpOutput);
} else { } else {
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
} }
} }
fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_flush(output_ptr, output->fpga_data_num * sizeof(int8_t));
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef GRID_SAMPLER_OP
DECLARE_KERNEL(GridSampler, GridSamplerParam);
#endif // GRID_SAMPLER_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -30,10 +30,10 @@ using framework::OpKernelBase; ...@@ -30,10 +30,10 @@ using framework::OpKernelBase;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class InstanceNormReluKernel class InstanceNormReluKernel
: public OpKernelBase<DeviceType, InstanceNormParam<DeviceType>> { : public OpKernelBase<DeviceType, FusionInstanceNormReluParam<DeviceType>> {
public: public:
void Compute(const InstanceNormParam<DeviceType> &param); void Compute(const FusionInstanceNormReluParam<DeviceType> &param);
bool Init(InstanceNormParam<DeviceType> *param); bool Init(FusionInstanceNormReluParam<DeviceType> *param);
}; };
} // namespace operators } // namespace operators
......
...@@ -24,8 +24,9 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const { ...@@ -24,8 +24,9 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const {
"Input(X) of BilinearInterOp should not be null."); "Input(X) of BilinearInterOp should not be null.");
PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr,
"Output(Out) of BilinearInterOp should not be null."); "Output(Out) of BilinearInterOp should not be null.");
auto dim_x = this->param_.InputX()->dims(); // NCHW format auto dim_x = this->param_.InputX()->dims(); // NCHW format
DLOG << "dim_x :" << dim_x;
int out_h = this->param_.OutH(); int out_h = this->param_.OutH();
int out_w = this->param_.OutW(); int out_w = this->param_.OutW();
PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4"); PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4");
...@@ -37,8 +38,22 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const { ...@@ -37,8 +38,22 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const {
"OutSize's dimension size must be 1"); "OutSize's dimension size must be 1");
PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2"); PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2");
} }
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
this->param_.Out()->Resize(framework::make_ddim(dim_out)); DLOG << "this->param_.HasScale(): " << this->param_.HasScale();
if (this->param_.HasScale()) {
const float scale = this->param_.Scale();
DLOG << "scale_: " << scale;
std::vector<int64_t> dim_out({dim_x[0], dim_x[1],
static_cast<int>(dim_x[2] * scale),
static_cast<int>(dim_x[3] * scale)});
this->param_.Out()->Resize(framework::make_ddim(dim_out));
DLOG << "interp -- dim_out: " << dim_out;
} else {
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
this->param_.Out()->Resize(framework::make_ddim(dim_out));
DLOG << "interp -- dim_out: " << dim_out;
}
} }
} // namespace operators } // namespace operators
......
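With the change above, a "scale" attribute, when present, derives the output H/W from the input dims instead of the out_h/out_w attributes; a hypothetical 1x3x16x16 input with scale 2.0 therefore yields a 1x3x32x32 output. A stand-alone sketch of the branch:

// Sketch of the output-shape rule added to NearestInterpolationOp above (NCHW).
#include <cstdint>
#include <vector>

std::vector<int64_t> InterpOutDims(const std::vector<int64_t> &x, bool has_scale,
                                   float scale, int out_h, int out_w) {
  if (has_scale) {
    return {x[0], x[1], static_cast<int64_t>(x[2] * scale),
            static_cast<int64_t>(x[3] * scale)};
  }
  return {x[0], x[1], out_h, out_w};
}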
...@@ -337,6 +337,11 @@ class OpParam { ...@@ -337,6 +337,11 @@ class OpParam {
return GetVarValue<T>("Filter", inputs, scope); return GetVarValue<T>("Filter", inputs, scope);
} }
template <typename T>
static T *GridFrom(const VariableNameMap &inputs, const Scope &scope) {
return GetVarValue<T>("Grid", inputs, scope);
}
template <typename T> template <typename T>
static const T GetAttr(const string &key, const AttributeMap &map) { static const T GetAttr(const string &key, const AttributeMap &map) {
return ((Attribute)map.at(key)).Get<T>(); return ((Attribute)map.at(key)).Get<T>();
...@@ -927,6 +932,35 @@ class InstanceNormParam : public OpParam { ...@@ -927,6 +932,35 @@ class InstanceNormParam : public OpParam {
Scope *scope) Scope *scope)
: OpParam(inputs, outputs, attrs, scope) { : OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope); input_x_ = InputXFrom<GType>(inputs, *scope);
output_y_ = OutputYFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
}
const GType *InputX() const { return input_x_; }
GType *OutputY() const { return output_y_; }
const float &Epsilon() const { return epsilon_; }
private:
GType *input_x_;
GType *output_y_;
float epsilon_;
};
#endif
#ifdef FUSION_INSTANCENORM_RELU_OP
template <typename Dtype>
class FusionInstanceNormReluParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionInstanceNormReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope); out_ = OutFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs); epsilon_ = GetAttr<float>("epsilon", attrs);
} }
...@@ -3008,7 +3042,7 @@ class SplitParam : public OpParam { ...@@ -3008,7 +3042,7 @@ class SplitParam : public OpParam {
int axis; int axis;
int num; int num;
std::vector<int> sections; std::vector<int> sections;
// std::vector<GType> out_ts_; // std::vector<GType> out_ts_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
...@@ -3069,12 +3103,20 @@ class NearestInterpolationParam : public OpParam { ...@@ -3069,12 +3103,20 @@ class NearestInterpolationParam : public OpParam {
out_ = OutFrom<GType>(outputs, *scope); out_ = OutFrom<GType>(outputs, *scope);
out_h_ = GetAttr<int>("out_h", attrs); out_h_ = GetAttr<int>("out_h", attrs);
out_w_ = GetAttr<int>("out_w", attrs); out_w_ = GetAttr<int>("out_w", attrs);
if (HasAttr("scale", attrs)) {
has_scale_ = true;
scale_ = GetAttr<float>("scale", attrs);
}
DLOG << "has_scale_: " << has_scale_;
DLOG << "scale_: " << scale_;
} }
const GType *InputX() const { return input_x_; } const GType *InputX() const { return input_x_; }
const GType *InputOutPutSize() const { return input_outsize_; } const GType *InputOutPutSize() const { return input_outsize_; }
GType *Out() const { return out_; } GType *Out() const { return out_; }
int OutH() const { return out_h_; } int OutH() const { return out_h_; }
int OutW() const { return out_w_; } int OutW() const { return out_w_; }
float Scale() const { return scale_; }
bool HasScale() const { return has_scale_; }
private: private:
GType *input_x_; GType *input_x_;
...@@ -3082,6 +3124,8 @@ class NearestInterpolationParam : public OpParam { ...@@ -3082,6 +3124,8 @@ class NearestInterpolationParam : public OpParam {
GType *out_; GType *out_;
int out_h_; int out_h_;
int out_w_; int out_w_;
float scale_ = 0.f;
bool has_scale_ = false;
}; };
#endif #endif
...@@ -3658,5 +3702,60 @@ class PixelShuffleParam : public OpParam { ...@@ -3658,5 +3702,60 @@ class PixelShuffleParam : public OpParam {
}; };
#endif #endif
#ifdef GRID_SAMPLER_OP
template <typename Dtype>
class GridSamplerParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
GridSamplerParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
grid_ = GridFrom<GType>(inputs, *scope);
output_ = OutputFrom<GType>(outputs, *scope);
}
const GType *InputX() const { return input_x_; }
const GType *Grid() const { return grid_; }
GType *Output() const { return output_; }
private:
GType *input_x_;
GType *grid_;
GType *output_;
};
#endif
#ifdef EXPAND_OP
template <typename Dtype>
class ExpandParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
ExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
expand_times = OpParam::GetAttr<std::vector<int>>("expand_times", attrs);
}
const GType *InputX() const { return input_x_; }
GType *Out() const { return out_; }
std::vector<int> expand_times;
private:
GType *input_x_;
GType *out_;
};
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -45,7 +45,7 @@ if (CON GREATER -1) ...@@ -45,7 +45,7 @@ if (CON GREATER -1)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h)
target_link_libraries(test-mobilenetgpu paddle-mobile) target_link_libraries(test-mobilenetgpu paddle-mobile)
endif () endif ()
...@@ -105,7 +105,7 @@ if (CON GREATER -1) ...@@ -105,7 +105,7 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp) ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp)
target_link_libraries(test-marker-api paddle-mobile) target_link_libraries(test-marker-api paddle-mobile)
#ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h ) #ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
#target_link_libraries(test-marker2 paddle-mobile) #target_link_libraries(test-marker2 paddle-mobile)
...@@ -193,13 +193,16 @@ endif () ...@@ -193,13 +193,16 @@ endif ()
list(FIND NET "op" CON) list(FIND NET "op" CON)
if (CON GREATER -1) if (CON GREATER -1)
# gen test # # gen test
ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) # ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid paddle-mobile) # target_link_libraries(test-sigmoid paddle-mobile)
#
# # gen test log
# ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp)
# target_link_libraries(test-leakyrelu paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-leakyrelu paddle-mobile) target_link_libraries(test-log paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
...@@ -208,342 +211,355 @@ if (ENABLE_ALL_TEST) ...@@ -208,342 +211,355 @@ if (ENABLE_ALL_TEST)
# gen test # gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile) target_link_libraries(test-resnet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile) target_link_libraries(test-squeezenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile) target_link_libraries(test_yolo_combined paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-op-in-net net/test_op_in_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-op-in-net net/test_op_in_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-op-in-net paddle-mobile) target_link_libraries(test-op-in-net paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile) target_link_libraries(test-googlenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet-quali paddle-mobile) target_link_libraries(test-googlenet-quali paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-op paddle-mobile) target_link_libraries(test-conv-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-expend-op operators/test_expend_op.cpp test_helper.h test_include.h executor_for_test_opencl.h)
target_link_libraries(test-expend-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
target_link_libraries(test-mul-op paddle-mobile) target_link_libraries(test-mul-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwiseadd-op paddle-mobile) target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwisesub-op paddle-mobile) target_link_libraries(test-elementwisesub-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h)
target_link_libraries(test-im2sequence-op paddle-mobile) target_link_libraries(test-im2sequence-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile) target_link_libraries(test-concat-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h)
target_link_libraries(test-lrn-op paddle-mobile) target_link_libraries(test-lrn-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h)
target_link_libraries(test-batchnorm-op paddle-mobile) target_link_libraries(test-batchnorm-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h)
target_link_libraries(test-priorbox-op paddle-mobile) target_link_libraries(test-priorbox-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h)
target_link_libraries(test-boxcoder-op paddle-mobile) target_link_libraries(test-boxcoder-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose-op paddle-mobile) target_link_libraries(test-transpose-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose2-op paddle-mobile) target_link_libraries(test-transpose2-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
target_link_libraries(test-multiclassnms-op paddle-mobile) target_link_libraries(test-multiclassnms-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
target_link_libraries(test-polygon-box-transform-op paddle-mobile) target_link_libraries(test-polygon-box-transform-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fill-constant-op paddle-mobile) target_link_libraries(test-fill-constant-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape-op paddle-mobile) target_link_libraries(test-reshape-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape2-op paddle-mobile) target_link_libraries(test-reshape2-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-relu-op paddle-mobile) target_link_libraries(test-relu-op paddle-mobile)
ADD_EXECUTABLE(test-relu6-op operators/test_relu6_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-relu6-op operators/test_relu6_op.cpp test_helper.h test_include.h)
target_link_libraries(test-relu6-op paddle-mobile) target_link_libraries(test-relu6-op paddle-mobile)
ADD_EXECUTABLE(test-tanh-op operators/test_tanh_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-tanh-op operators/test_tanh_op.cpp test_helper.h test_include.h)
target_link_libraries(test-tanh-op paddle-mobile) target_link_libraries(test-tanh-op paddle-mobile)
ADD_EXECUTABLE(test-log-op operators/test_log_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-log-op operators/test_log_op.cpp test_helper.h test_include.h)
target_link_libraries(test-log-op paddle-mobile) target_link_libraries(test-log-op paddle-mobile)
ADD_EXECUTABLE(test-topk-op operators/test_topk_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-topk-op operators/test_topk_op.cpp test_helper.h test_include.h)
target_link_libraries(test-topk-op paddle-mobile) target_link_libraries(test-topk-op paddle-mobile)
ADD_EXECUTABLE(test-cast-op operators/test_cast_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-cast-op operators/test_cast_op.cpp test_helper.h test_include.h)
target_link_libraries(test-cast-op paddle-mobile) target_link_libraries(test-cast-op paddle-mobile)
ADD_EXECUTABLE(test-less-than-op operators/test_less_than_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-less-than-op operators/test_less_than_op.cpp test_helper.h test_include.h)
target_link_libraries(test-less-than-op paddle-mobile) target_link_libraries(test-less-than-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fc-op paddle-mobile) target_link_libraries(test-fc-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sum-op paddle-mobile) target_link_libraries(test-sum-op paddle-mobile)
# test quantize op # test quantize op
ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-quantize-op paddle-mobile) target_link_libraries(test-quantize-op paddle-mobile)
# test dequantize op # test dequantize op
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile) target_link_libraries(test-dequantize-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile) target_link_libraries(test-log paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-load framework/test_load.cpp) ADD_EXECUTABLE(test-load framework/test_load.cpp)
target_link_libraries(test-load paddle-mobile) target_link_libraries(test-load paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp) ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp)
target_link_libraries(test-loadmemory paddle-mobile) target_link_libraries(test-loadmemory paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp) ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp)
target_link_libraries(test-loadmemory-inference paddle-mobile) target_link_libraries(test-loadmemory-inference paddle-mobile)
ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
target_link_libraries(test-inference-api paddle-mobile) target_link_libraries(test-inference-api paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
target_link_libraries(test-optimize paddle-mobile) target_link_libraries(test-optimize paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile) target_link_libraries(test-pool-op paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax-op paddle-mobile) target_link_libraries(test-softmax-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile) target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp) ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp)
target_link_libraries(test-gemm-int8-accuracy paddle-mobile) target_link_libraries(test-gemm-int8-accuracy paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile) target_link_libraries(test-gemm-perf paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
target_link_libraries(test-enforce paddle-mobile) target_link_libraries(test-enforce paddle-mobile)
# gen test - test if openmp works # gen test - test if openmp works
ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-openmp paddle-mobile) target_link_libraries(test-openmp paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenetssd paddle-mobile) target_link_libraries(test-mobilenetssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet-combine paddle-mobile) target_link_libraries(test-mobilenet-combine paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-genet paddle-mobile) target_link_libraries(test-genet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-sigmoid-op operators/test_sigmoid_op.cpp test_include.h) ADD_EXECUTABLE(test-sigmoid-op operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid-op paddle-mobile) target_link_libraries(test-sigmoid-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp)
target_link_libraries(test-leakyrelu paddle-mobile) target_link_libraries(test-leakyrelu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-depthwise-conv-op paddle-mobile) target_link_libraries(test-depthwise-conv-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile) target_link_libraries(test-mobilenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-nlp paddle-mobile) target_link_libraries(test-nlp paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h)
target_link_libraries(test-gru-op paddle-mobile) target_link_libraries(test-gru-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inceptionv4 paddle-mobile) target_link_libraries(test-inceptionv4 paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-alexnet paddle-mobile) target_link_libraries(test-alexnet paddle-mobile)
ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h)
target_link_libraries(test-googlenetv1 paddle-mobile) target_link_libraries(test-googlenetv1 paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
target_link_libraries(test-fssd paddle-mobile) target_link_libraries(test-fssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h)
target_link_libraries(test-mobilenetgpu paddle-mobile) target_link_libraries(test-mobilenetgpu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yologpu paddle-mobile) target_link_libraries(test-yologpu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile) target_link_libraries(test-multi-process paddle-mobile)
# gen test benchmark # gen test benchmark
ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp) ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp)
target_link_libraries(test-benchmark paddle-mobile) target_link_libraries(test-benchmark paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
target_link_libraries(test-eng paddle-mobile) target_link_libraries(test-eng paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile) target_link_libraries(test-super paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
target_link_libraries(test-ocr paddle-mobile) target_link_libraries(test-ocr paddle-mobile)
ADD_EXECUTABLE(test-gesture net/test_gesture.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-gesture net/test_gesture.cpp test_helper.h test_include.h)
target_link_libraries(test-gesture paddle-mobile) target_link_libraries(test-gesture paddle-mobile)
ADD_EXECUTABLE(test-sequence-expand-op operators/test_sequence_expand_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-expand-op operators/test_sequence_expand_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-expand-op paddle-mobile) target_link_libraries(test-sequence-expand-op paddle-mobile)
ADD_EXECUTABLE(test-sequence-pool-op operators/test_sequence_pool_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-pool-op operators/test_sequence_pool_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-pool-op paddle-mobile) target_link_libraries(test-sequence-pool-op paddle-mobile)
ADD_EXECUTABLE(test-sequence-softmax-op operators/test_sequence_softmax_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-softmax-op operators/test_sequence_softmax_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-softmax-op paddle-mobile) target_link_libraries(test-sequence-softmax-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h)
target_link_libraries(test-vgg16ssd paddle-mobile) target_link_libraries(test-vgg16ssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-and-op paddle-mobile) target_link_libraries(test-logical-and-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-or-op paddle-mobile) target_link_libraries(test-logical-or-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-not-op paddle-mobile) target_link_libraries(test-logical-not-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-xor-op paddle-mobile) target_link_libraries(test-logical-xor-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-increment-op operators/test_increment_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-increment-op operators/test_increment_op.cpp test_helper.h test_include.h)
target_link_libraries(test-increment-op paddle-mobile) target_link_libraries(test-increment-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-is-empty-op operators/test_is_empty_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-is-empty-op operators/test_is_empty_op.cpp test_helper.h test_include.h)
target_link_libraries(test-is-empty-op paddle-mobile) target_link_libraries(test-is-empty-op paddle-mobile)
ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-bn-relu-op paddle-mobile) target_link_libraries(test-conv-bn-relu-op paddle-mobile)
ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dwconv-bn-relu-op paddle-mobile) target_link_libraries(test-dwconv-bn-relu-op paddle-mobile)
ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-gpu paddle-mobile) target_link_libraries(test-conv-gpu paddle-mobile)
ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h)
target_link_libraries(test-net-benchmark paddle-mobile) target_link_libraries(test-net-benchmark paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net paddle-mobile) target_link_libraries(test-net paddle-mobile)
# gen test
ADD_EXECUTABLE(test-net-feeds net/test_net_multi_feed.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net-feeds paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net-performance paddle-mobile) target_link_libraries(test-net-performance paddle-mobile)
ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inference-api-v2 paddle-mobile)
endif () endif ()
else() else ()
# gen test # gen test
ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net paddle-mobile) target_link_libraries(test-net paddle-mobile)
ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h)
target_link_libraries(test-net-benchmark paddle-mobile) target_link_libraries(test-net-benchmark paddle-mobile)
endif()
ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inference-api-v2 paddle-mobile)
endif ()
...@@ -15,10 +15,8 @@ limitations under the License. */ ...@@ -15,10 +15,8 @@ limitations under the License. */
#include "common/log.h" #include "common/log.h"
int main() { int main() {
DLOGF("DASJFDAFJ%d -- %f", 12345, 344.234); LOG(paddle_mobile::kLOG_DEBUG3) << "test debug"
<< " next log";
LOGF(paddle_mobile::kLOG_DEBUG, "DASJFDAFJ%d -- %f", 12345, 344.234);
LOG(paddle_mobile::kLOG_DEBUG) << "test debug" LOG(paddle_mobile::kLOG_DEBUG) << "test debug"
<< " next log"; << " next log";
...@@ -26,9 +24,12 @@ int main() { ...@@ -26,9 +24,12 @@ int main() {
<< " next log"; << " next log";
LOG(paddle_mobile::kLOG_DEBUG2) << "test debug2" LOG(paddle_mobile::kLOG_DEBUG2) << "test debug2"
<< " next log"; << " next log";
LOG(paddle_mobile::kLOG_INFO) << "INFO!!!";
LOG(paddle_mobile::kLOG_WARNING) << "WARNING!!!";
LOG(paddle_mobile::kLOG_VERBOSE) << "VERBOSE!!!";
DLOG << "test DLOG"; DLOG << "test DLOG";
LOG(paddle_mobile::kLOG_ERROR) << " error occur !"; LOG(paddle_mobile::kLOG_ERROR) << "ERROR !";
return 0; return 0;
} }

...@@ -14,9 +14,9 @@ limitations under the License. */ ...@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/log.h" #include "common/log.h"
#include "framework/executor.h" #include "framework/executor.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
...@@ -74,8 +74,11 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -74,8 +74,11 @@ class Executor4Test : public Executor<DeviceType> {
break; break;
} }
} }
if (this->program_.combined) {
this->InitMemory(); this->InitCombineMemory();
} else {
this->InitMemory();
}
for (const auto &op : this->ops_of_block0_) { for (const auto &op : this->ops_of_block0_) {
op->Init(); op->Init();
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_CL
#include <memory>
#include <string>
#include <vector>
#include "./test_helper.h"
#include "common/log.h"
#include "framework/cl/cl_helper.h"
#include "framework/cl/cl_tensor.h"
#include "framework/executor.h"
#include "framework/op_registry.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
using paddle_mobile::framework::AttributeMap;
using paddle_mobile::framework::BlockDesc;
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Executor;
using paddle_mobile::framework::LoDTensor;
using paddle_mobile::framework::OpDesc;
using paddle_mobile::framework::OperatorBase;
using paddle_mobile::framework::Program;
using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable;
using std::string;
using std::vector;
namespace paddle_mobile {
template <typename OpType>
class OpenClOpTester {
public:
OpenClOpTester() {
framework::CLEngine::Instance()->setClPath("/data/local/tmp/bin");
scope_ = std::make_shared<paddle_mobile::framework::Scope>();
feed_clhelper_ = framework::CLHelper(scope_->GetCLScpoe());
fetch_clhelper_ = framework::CLHelper(scope_->GetCLScpoe());
this->feed_clhelper_.AddKernel("feed", "feed_kernel.cl");
this->fetch_clhelper_.AddKernel("fetch", "fetch_kernel.cl");
feed_var = scope_.get()->Var("feed");
fetch_var = scope_.get()->Var("fetch");
op_in_var = scope_.get()->Var("op_in");
op_out_var = scope_.get()->Var("op_out");
}
void Predict(string op_type, DDim feed_dims, DDim fetch_dims,
VariableNameMap inputs_feed, VariableNameMap outputs_feed,
AttributeMap attrs_feed) {
framework::CLImage *const op_in_cl_image =
op_in_var->template GetMutable<framework::CLImage>();
op_in_cl_image->Resize(feed_dims);
op_in_cl_image->InitEmptyImage(feed_clhelper_.CLContext(),
feed_clhelper_.CLCommandQueue(), feed_dims);
framework::CLImage *const op_out_cl_image =
op_out_var->template GetMutable<framework::CLImage>();
op_out_cl_image->Resize(fetch_dims);
framework::CLScope *const clScpoe = scope_->GetCLScpoe();
op_out_cl_image->InitEmptyImage(clScpoe->Context(), clScpoe->CommandQueue(),
fetch_dims);
Feed(feed_dims);
auto *op = new OpType(op_type, inputs_feed, outputs_feed, attrs_feed,
scope_.get());
op->InferShape();
op->Init();
op->Run();
Fetch(fetch_dims);
}
void Feed(DDim feed_dims) {
auto *feed_var = scope_->Var("feed");
auto *_var = scope_->Var("op_in");
auto *const input = feed_var->template GetMutable<framework::LoDTensor>();
DLOG << "feed_dims: " << feed_dims;
SetupTensor<float>(input, feed_dims, -100.0, 100.0);
framework::CLImage *const op_in_cl_image =
op_in_var->template GetMutable<framework::CLImage>();
DLOG << "FeedKernel run ";
DLOG << "params.input " << *input;
DLOG << "params.op_in_cl_image " << *op_in_cl_image;
auto kernel = this->feed_clhelper_.KernelAt(0);
DLOG << "kernel get success ";
auto default_work_size =
this->feed_clhelper_.DefaultWorkSize(*(op_in_cl_image));
DLOG << "op_in_cl_image: " << *op_in_cl_image;
DLOG << "default_work_size: " << default_work_size;
cl_int status;
int numel = input->numel();
cl_mem output_image = op_in_cl_image->GetCLImage();
const int out_C = op_in_cl_image->dims()[1];
const int out_H = op_in_cl_image->dims()[2];
const int out_W = op_in_cl_image->dims()[3];
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
framework::CLTensor input_cl_tensor(this->feed_clhelper_.CLContext(),
this->feed_clhelper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer;
inputBuffer =
input_cl_tensor.mutable_with_data<float>(input->data<float>());
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->feed_clhelper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
DLOG << "*op_in_cl_image: " << *op_in_cl_image;
}
void Fetch(DDim fetch_dims) {
DLOG << "------------------ Fetch op ---------------------";
DLOG << "------------------ Fetch op end ---------------------";
}
private:
std::shared_ptr<paddle_mobile::framework::Scope> scope_;
framework::CLHelper feed_clhelper_;
framework::CLHelper fetch_clhelper_;
Variable *feed_var;
Variable *fetch_var;
Variable *op_in_var;
Variable *op_out_var;
};
} // namespace paddle_mobile
#endif
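For reference, the Stride0/Stride1/Stride2 arguments handed to the feed kernel above are the usual NCHW strides. A tiny sketch of how they index a flat buffer (nchw_offset is a hypothetical helper, not part of the framework):

def nchw_offset(n, c, h, w, C, H, W):
    # Stride0 = W, Stride1 = H * W, Stride2 = C * H * W, as set via clSetKernelArg above
    stride0, stride1, stride2 = W, H * W, C * H * W
    return n * stride2 + c * stride1 + h * stride0 + w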
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "io/paddle_inference_api.h"
using namespace paddle_mobile; // NOLINT
PaddleMobileConfig GetConfig() {
PaddleMobileConfig config;
config.precision = PaddleMobileConfig::FP32;
config.device = PaddleMobileConfig::kGPU_CL;
config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST;
config.prog_file = "../models/ercy/model";
config.param_file = "../models/ercy/params";
config.lod_mode = false;
config.load_when_predict = false;
return config;
}
int main() {
PaddleMobileConfig config = GetConfig();
auto predictor =
CreatePaddlePredictor<PaddleMobileConfig,
PaddleEngineKind::kPaddleMobile>(config);
// reliable
int re_len = 1 * 1 * 64 * 72;
std::vector<float> re_v;
std::vector<int64_t> re_dims{1, 1, 64, 72};
GetInput<float>(g_test_image_1x3x224x224, &re_v, re_dims);
PaddleTensor re;
re.shape = std::vector<int>({1, 1, 64, 72});
re.data = PaddleBuf(re_v.data(), re_len * sizeof(float));
re.dtype = PaddleDType::FLOAT32;
re.layout = LayoutType::LAYOUT_CHW;
// grid
int grid_len = 1 * 64 * 72 * 2;
std::vector<float> grid_v;
std::vector<int64_t> grid_dims{1, 64, 72, 2};
GetInput<float>(g_test_image_1x3x224x224, &grid_v, grid_dims);
PaddleTensor grid;
grid.shape = std::vector<int>({1, 64, 72, 2});
grid.data = PaddleBuf(grid_v.data(), grid_len * sizeof(float));
grid.dtype = PaddleDType::FLOAT32;
grid.layout = LayoutType::LAYOUT_CHW;
// last_input
int last_len = 1 * 128 * 64 * 72;
std::vector<float> last_v;
std::vector<int64_t> last_dims{1, 128, 64, 72};
GetInput<float>(g_test_image_1x3x224x224, &last_v, last_dims);
PaddleTensor last;
last.shape = std::vector<int>({1, 128, 64, 72});
last.data = PaddleBuf(last_v.data(), last_len * sizeof(float));
last.dtype = PaddleDType::FLOAT32;
last.layout = LayoutType::LAYOUT_CHW;
// input_rgb
int input_rgb_len = 1 * 4 * 256 * 288;
std::vector<float> input_rgb_v;
std::vector<int64_t> input_rgb_dims{1, 4, 256, 288};
GetInput<float>(g_test_image_1x3x224x224, &input_rgb_v, input_rgb_dims);
PaddleTensor input_rgb;
input_rgb.shape = std::vector<int>({1, 4, 256, 288});
input_rgb.data = PaddleBuf(input_rgb_v.data(), input_rgb_len * sizeof(float));
input_rgb.dtype = PaddleDType::FLOAT32;
input_rgb.layout = LayoutType::LAYOUT_CHW;
PaddleTensor output0;
output0.shape = std::vector<int>({});
output0.data = PaddleBuf();
output0.dtype = PaddleDType::FLOAT32;
output0.layout = LayoutType::LAYOUT_CHW;
PaddleTensor output1;
output1.shape = std::vector<int>({});
output1.data = PaddleBuf();
output1.dtype = PaddleDType::FLOAT32;
output1.layout = LayoutType::LAYOUT_CHW;
predictor->Feed("reliable", re);
predictor->Feed("grid", grid);
predictor->Feed("last_input", last);
predictor->Feed("input_rgb", input_rgb);
predictor->Run();
predictor->Fetch("save_infer_model/scale_0", &output0);
predictor->Fetch("save_infer_model/scale_1", &output1);
float* out_ptr0 = reinterpret_cast<float*>(output0.data.data());
float* out_ptr1 = reinterpret_cast<float*>(output1.data.data());
std::cout << " print output0 : " << std::endl;
int numel = output0.data.length() / sizeof(float);
int stride = numel / 20;
stride = stride > 0 ? stride : 1;
for (int j = 0; j < numel; j += stride) {
std::cout << out_ptr0[j] << " ";
}
std::cout << std::endl;
std::cout << " print output1 : " << std::endl;
numel = output1.data.length() / sizeof(float);
stride = numel / 20;
stride = stride > 0 ? stride : 1;
for (int j = 0; j < numel; j += stride) {
std::cout << out_ptr1[j] << " ";
}
std::cout << std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include <fstream>
#include <iostream>
#include <string>
#include "../test_helper.h"
#include "../test_include.h"
void test(int argc, char *argv[]);
void feed(PaddleMobile<paddle_mobile::GPU_CL> *paddle_mobile, const DDim &dims,
std::string feed_name) {
float *input_data_array = new float[product(dims)];
std::ifstream in(feed_name, std::ios::in);
for (int i = 0; i < product(dims); i++) {
float num;
in >> num;
input_data_array[i] = num;
}
in.close();
framework::Tensor input_tensor(input_data_array, dims);
DLOG << feed_name << " : " << input_tensor;
paddle_mobile->Feed(feed_name, input_tensor);
}
int main(int argc, char *argv[]) {
test(argc, argv);
return 0;
}
void test(int argc, char *argv[]) {
int arg_index = 1;
bool fuse = std::stoi(argv[arg_index]) == 1;
arg_index++;
bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1;
arg_index++;
bool quantification = std::stoi(argv[arg_index]) == 1;
arg_index++;
int quantification_fold = std::stoi(argv[arg_index]);
arg_index++;
paddle_mobile::PaddleMobileConfigInternal config;
config.memory_optimization_level = enable_memory_optimization
? MemoryOptimizationWithoutFeeds
: NoMemoryOptimization;
#ifdef PADDLE_MOBILE_CL
// config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin");
std::cout << "testing opencl yyz " << std::endl;
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
paddle_mobile.SetThreadNum(1);
std::cout << "testing cpu yyz " << std::endl;
#endif
int dim_count = std::stoi(argv[arg_index]);
arg_index++;
int size = 1;
arg_index += dim_count;
bool is_lod = std::stoi(argv[arg_index]) == 1;
arg_index++;
paddle_mobile::framework::LoD lod{{}};
if (is_lod) {
int lod_count = std::stoi(argv[arg_index]);
arg_index++;
for (int i = 0; i < lod_count; i++) {
int dim = std::stoi(argv[arg_index + i]);
lod[0].push_back(dim);
}
arg_index += lod_count;
}
int var_count = std::stoi(argv[arg_index]);
arg_index++;
bool is_sample_step = std::stoi(argv[arg_index]) == 1;
arg_index++;
int sample_arg = std::stoi(argv[arg_index]);
int sample_step = sample_arg;
int sample_num = sample_arg;
arg_index++;
std::vector<std::string> var_names;
for (int i = 0; i < var_count; i++) {
std::string var_name = argv[arg_index + i];
var_names.push_back(var_name);
}
arg_index += var_count;
bool check_shape = std::stoi(argv[arg_index]) == 1;
arg_index++;
auto time1 = time();
if (paddle_mobile.Load("./checked_model/model", "./checked_model/params",
fuse, quantification, 1, is_lod,
quantification_fold)) {
auto time2 = time();
std::cout << "auto-test"
<< " load-time-cost :" << time_diff(time1, time2) << "ms"
<< std::endl;
feed(&paddle_mobile, {1, 4, 256, 288}, "input_rgb");
feed(&paddle_mobile, {1, 128, 64, 72}, "last_input");
feed(&paddle_mobile, {1, 64, 72, 2}, "grid");
feed(&paddle_mobile, {1, 1, 64, 72}, "reliable");
paddle_mobile.Predict();
#ifdef PADDLE_MOBILE_CL
for (auto var_name : var_names) {
auto cl_image = paddle_mobile.FetchImage(var_name);
if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) {
continue;
}
auto len = cl_image->numel();
if (len == 0) {
continue;
}
size_t width = cl_image->ImageDims()[0];
size_t height = cl_image->ImageDims()[1];
paddle_mobile::framework::half_t *image_data =
new paddle_mobile::framework::half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image->CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
float *tensor_data = new float[cl_image->numel()];
auto converter = cl_image->Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image->ImageDims(),
cl_image->dims());
auto data = tensor_data;
std::string sample = "";
if (check_shape) {
for (int i = 0; i < cl_image->dims().size(); i++) {
sample += " " + std::to_string(cl_image->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
}
#else
for (auto var_name : var_names) {
auto out = paddle_mobile.Fetch(var_name);
auto len = out->numel();
if (len == 0) {
continue;
}
if (out->memory_size() == 0) {
continue;
}
if (out->type() == type_id<int>()) {
auto data = out->data<int>();
std::string sample = "";
if (check_shape) {
for (int i = 0; i < out->dims().size(); i++) {
sample += " " + std::to_string(out->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
} else if (out->type() == type_id<float>()) {
auto data = out->data<float>();
std::string sample = "";
if (check_shape) {
for (int i = 0; i < out->dims().size(); i++) {
sample += " " + std::to_string(out->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
}
}
#endif
std::cout << std::endl;
}
}
#endif
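The feed() helper in the test above reads each input from a plain text file with one float per line, product(dims) values in total. A small sketch of producing such a file (the "reliable" file name and the shape are just the example values used above; the check script below writes these files the same way):

import numpy as np

data = np.random.random((1, 1, 64, 72)).astype("float32")   # shape of the "reliable" input above
np.savetxt("reliable", data.flatten(), fmt="%f")             # one value per line, as feed() expects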
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "../executor_for_test_opencl.h"
#include "operators/expand_op.h"
#include "operators/feed_op.h"
#ifdef EXPAND_OP
int main() {
const int IN_N = 1;
const int IN_C = 1;
const int IN_H = 2;
const int IN_W = 3;
const int EXPEND_N = 1;
const int EXPEND_C = 1;
const int EXPEND_H = 2;
const int EXPEND_W = 2;
const int OUT_N = IN_N * EXPEND_N;
const int OUT_C = IN_C * EXPEND_C;
const int OUT_H = IN_H * EXPEND_H;
const int OUT_W = IN_W * EXPEND_W;
framework::DDim in_dims = framework::make_ddim({IN_N, IN_C, IN_H, IN_W});
framework::DDim out_dims = framework::make_ddim({OUT_N, OUT_C, OUT_H, OUT_W});
VariableNameMap inputs;
VariableNameMap outputs;
AttributeMap attrs;
inputs["X"] = std::vector<std::string>({"op_in"});
outputs["Out"] = std::vector<std::string>({"op_out"});
std::vector<int> expand_times = {EXPEND_N, EXPEND_C, EXPEND_H, EXPEND_W};
attrs["expand_times"].Set<std::vector<int>>(expand_times);
OpenClOpTester<operators::ExpandOp<GPU_CL, float>> tester;
tester.Predict("expend", in_dims, out_dims, inputs, outputs, attrs);
}
#endif
#else
int main() {}
#endif
...@@ -273,8 +273,9 @@ endif() ...@@ -273,8 +273,9 @@ endif()
list(FIND NET "op" CON) list(FIND NET "op" CON)
if (CON GREATER -1) if (CON GREATER -1)
message("op enabled") message("op enabled")
set(SIGMOID_OP ON) # set(SIGMOID_OP ON)
set(LEAKY_RELU_OP ON) # set(LEAKY_RELU_OP ON)
set(BLOG ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
...@@ -379,6 +380,8 @@ if(NOT FOUND_MATCH) ...@@ -379,6 +380,8 @@ if(NOT FOUND_MATCH)
set(REDUCE_PROD_OP ON) set(REDUCE_PROD_OP ON)
set(FUSION_INSTANCENORM_RELU_OP ON) set(FUSION_INSTANCENORM_RELU_OP ON)
set(PIXEL_SHUFFLE_OP ON) set(PIXEL_SHUFFLE_OP ON)
set(EXPAND_OP ON)
set(GRID_SAMPLER_OP ON)
endif() endif()
# option(BATCHNORM_OP "" ON) # option(BATCHNORM_OP "" ON)
...@@ -755,3 +758,13 @@ endif() ...@@ -755,3 +758,13 @@ endif()
if (PIXEL_SHUFFLE_OP) if (PIXEL_SHUFFLE_OP)
add_definitions(-DPIXEL_SHUFFLE_OP) add_definitions(-DPIXEL_SHUFFLE_OP)
endif() endif()
if (EXPAND_OP)
add_definitions(-DEXPAND_OP)
endif()
if (GRID_SAMPLER_OP)
add_definitions(-DGRID_SAMPLER_OP)
endif()
if (BLOG)
add_definitions(-DBLOG)
endif()
...@@ -3,3 +3,4 @@ ...@@ -3,3 +3,4 @@
!.gitignore !.gitignore
!/model-encrypt-tool !/model-encrypt-tool
!test_wrap.py !test_wrap.py
!run_multi_feed.py
# -*- coding: utf-8 -*-
import os
import sys
import math
import subprocess
import numpy as np
import paddle.fluid as fluid
model_path = "erciyuan"
checked_model_path = "checked_model"
feed_path = "feeds"
output_path = "outputs"
diff_threshold = 0.1
is_lod = False
mobile_model_path = ""
fast_check = False
is_sample_step = False
sample_step = 1
sample_num = 20
need_encrypt = False
checked_encrypt_model_path = "checked_encrypt_model"
output_var_filter = []
output_key_filter = {}
check_shape = False
quantification = False
quantification_fold = 1000
architecture = "arm-v7a"
# architecture = "arm-v8a"
correct_persistable = False
np.set_printoptions(linewidth=150)
mobile_exec_root = "/data/local/tmp/bin"
mobile_src_root = os.path.abspath("../../../")
if mobile_src_root.endswith("/"):
mobile_src_root = mobile_src_root[:-1]
dot = "•"
black = lambda x: "\033[30m" + str(x) + "\033[0m"
red = lambda x: "\033[31m" + str(x) + "\033[0m"
green = lambda x: "\033[32m" + str(x) + "\033[0m"
yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
reset = lambda x: "\033[0m" + str(x)
feed_names_ = []
def pp_tab(x, level=0):
header = ""
for i in range(0, level):
header += "\t"
print(header + str(x))
def pp_black(x, level=0):
pp_tab(black(x) + reset(""), level)
def pp_red(x, level=0):
pp_tab(red(x) + reset(""), level)
def pp_green(x, level=0):
pp_tab(green(x) + reset(""), level)
def pp_yellow(x, level=0):
pp_tab(yellow(x) + reset(""), level)
def sh(command):
pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return pipe.stdout.read().decode("utf-8")
def push(src, dest=""):
sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
pp_yellow(dot + " start inspecting fluid model")
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# Load the model
def load_model(model_path):
prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
global correct_persistable
if correct_persistable:
ops = prog.current_block().ops
vars = prog.current_block().vars
for op in ops:
for var_name in op.output_arg_names:
if var_name == "fetch":
continue
var = vars[var_name]
if var.persistable:
pp_red("has found non-persistable output var : {}".format(var_name))
var.persistable = False
return (prog, feeds, fetches)
prog, feeds, fetches = load_model(model_path)
# Force all tensor shapes to agree between the model and params files, then re-save the model
def resave_model(feed_kv):
if len(mobile_model_path) > 0:
pp_green("has set mobile_model_path, stop checking model & params", 1)
sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
return
ops = prog.current_block().ops
vars = prog.current_block().vars
# Force all vars to be persistable
p_names = []
for name in vars:
name = str(name)
v = fluid.framework._get_var(name, prog)
if not v.persistable:
v.persistable = True
p_names.append(name)
outputs = run_model(feed_kv=feed_kv)
has_found_wrong_shape = False
# Correct each var's shape
for name in vars:
name = str(name)
v = vars[name]
if v.persistable:
v1 = fluid.global_scope().find_var(name)
try:
t1 = v1.get_tensor()
shape = t1.shape()
except:
continue
if v.desc.shape() != shape:
has_found_wrong_shape = True
v.desc.set_shape(shape)
# Restore each var's persistable attribute
for name in p_names:
v = fluid.framework._get_var(name, prog)
v.persistable = False
if not quantification:
fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
if has_found_wrong_shape:
pp_red("has found wrong shape", 1)
else:
pp_green("has not found wrong shape", 1)
pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
# Encrypt model and params separately, using the same key for both
def encrypt_model():
if not need_encrypt:
return
pp_yellow(dot + dot + " encrypting model")
if not os.path.exists(checked_encrypt_model_path):
os.mkdir(checked_encrypt_model_path)
res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
lines = res.split("\n")
for line in lines:
if line.startswith("key:"):
line = line.replace('key:','')
sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/model -o "
"checked_model/model.ml".format(line))
sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/params -o checked_model/params.ml".format(line))
pp_green("model has been encrypted, key is : {}".format(line), 1)
sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
return
pp_red("model encrypt error", 1)
# Generate feed key-value pairs
def gen_feed_kv():
feed_kv = {}
for feed_name in feeds:
feed_shape = get_feed_var_shape(feed_name)
data = np.random.random(feed_shape).astype("float32")
feed_kv[feed_name] = data
return feed_kv
# Save the feed key-value pairs
def save_feed_kv(feed_kv):
for feed_name in feed_kv:
feed_data = feed_kv[feed_name]
feed_list = feed_data.flatten().tolist()
if not os.path.exists(feed_path):
os.mkdir(feed_path)
file_name = feed_name.replace("/", "_")
out_file = open(feed_path + "/" + file_name, "w")
for feed_item in feed_list:
out_file.write("{}\n".format(feed_item))
out_file.close()
last_feed_var_name = None
last_feed_file_name = None
last_feed_var_lod = None
# Load the feed key-value pairs
def load_feed_kv():
if not os.path.exists(feed_path):
return None
global last_feed_var_name
global last_feed_file_name
global last_feed_var_lod
feed_kv = {}
pp_yellow(dot + dot + " checking feed info")
pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
for feed_name in feeds:
feed_shape = get_feed_var_shape(feed_name)
pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
file_name = feed_name.replace("/", "_")
last_feed_var_name = feed_name
last_feed_file_name = file_name
feed_file_path = feed_path + "/" + file_name
if not os.path.exists(feed_file_path):
return None
data = np.loadtxt(feed_file_path)
expected_len = 1
for dim in feed_shape:
expected_len *= dim
if len(np.atleast_1d(data)) != expected_len:
return None
data = data.reshape(feed_shape).astype("float32")
if is_lod:
data_shape = [1]
for dim in feed_shape:
data_shape.append(dim)
data = data.reshape(data_shape).astype("float32")
tensor = fluid.LoDTensor()
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
data = data.reshape(feed_shape)
tensor.set(data, fluid.CPUPlace())
tensor.set_lod([lod])
last_feed_var_lod = lod
feed_kv[feed_name] = tensor
else:
feed_kv[feed_name] = data
return feed_kv
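As a concrete example of the LoD built above: for two sequences of lengths 2 and 3, lod becomes [0, 2, 5], and sequence i occupies rows lod[i]:lod[i+1] of the tensor.

seq_lens = [2, 3]
lod = [0]
for l in seq_lens:
    lod.append(lod[-1] + l)
assert lod == [0, 2, 5]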
# Run the model
def run_model(feed_kv=None):
pp_yellow("run_model", 1)
if feed_kv is None:
feed_kv = gen_feed_kv()
feed_names_.clear()
for feed_name in feeds:
feed_names_.append(feed_name)
pp_green(feed_name, 1)
pp_green(feed_names_, 1)
outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
results = []
for output in outputs:
results.append(np.array(output))
return results
# Get a variable's shape
def get_var_shape(var_name):
vars = prog.current_block().vars
shape = vars[var_name].desc.shape()
for i in range(len(shape)):
dim = shape[i]
if dim == -1:
shape[i] = 1
return shape
# Get a feed variable's shape
def get_feed_var_shape(var_name):
# To hard-code the input shape, uncomment the following line
# return [1, 3, 224, 224]
return get_var_shape(var_name)
persistable_cache = []
# Make every var persistable
def force_all_vars_to_persistable():
global persistable_cache
for var_name in vars.keys():
var_name = str(var_name)
v = fluid.framework._get_var(var_name, prog)
persistable = v.persistable
if not persistable:
persistable_cache.append(var_name)
v.persistable = True
# Restore the persistable attributes
def restore_all_vars_persistable():
global persistable_cache
for var_name in vars.keys():
var_name = str(var_name)
v = fluid.framework._get_var(var_name, prog)
persistable = v.persistable
if var_name in persistable_cache:
v.persistable = False
persistable_cache = []
# Get a var's data
def get_var_data(var_name, feed_kv=None):
output = np.array(fluid.global_scope().var(var_name).get_tensor())
return output
output_var_cache = {}
def tensor_sample(tensor):
if is_sample_step:
step = sample_step
else:
step = math.floor(len(tensor) / sample_num)
step = max(step, 1)
step = int(step)
sample = []
for i in range(0, len(tensor), step):
sample.append(tensor[i])
return sample
op_cache = {}
# Save the output data of every op
def save_all_op_output(feed_kv=None):
force_all_vars_to_persistable()
outputs = run_model(feed_kv=feed_kv)
if not os.path.exists(output_path):
os.mkdir(output_path)
ops = prog.current_block().ops
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
feed_names = feeds
if len(output_var_filter) > 0:
for fetch_name in fetch_names:
output_var_filter.append(fetch_name)
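# For each op, pick the output var to record: prefer the output slot named Y/Out/Output,
# otherwise fall back to an output arg whose name contains "tmp".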
for i in range(len(ops)):
op = ops[i]
var_name = None
var_name_index = -1
for index in range(len(op.output_names)):
if op.output_names[index] in ["Y", "Out", "Output"]:
var_name_index = index
break
if var_name_index != -1:
var_name = op.output_arg_names[var_name_index]
else:
for name in op.output_arg_names:
var_name = name
if "tmp" in name:
break
if len(output_var_filter) > 0:
if var_name not in output_var_filter:
continue
# real_var_name = None
# if op.type == "fetch":
# for name in op.input_arg_names:
# real_var_name = name
# if "tmp" in name:
# break
# else:
# real_var_name = var_name
if fast_check:
if var_name not in fetch_names and var_name not in feed_names:
continue
try:
data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
sample = tensor_sample(data)
output_var_cache[var_name] = (sample)
op_cache[i] = (var_name, op)
file_name = var_name.replace("/", "_")
out_file = open(output_path + "/" + file_name, "w")
if var_name in feed_names:
for item in data:
out_file.write("{}\n".format(item))
else:
for item in sample:
out_file.write("{}\n".format(item))
out_file.close()
except:
pass
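# Vars that cannot be fetched from the scope are skipped silently by the bare except above.
# Second pass: also record outputs selected by output_key_filter (a map of op type -> output slot names).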
for i in range(len(ops)):
op = ops[i]
if op.type not in output_key_filter:
continue
var_name = None
var_name_index = -1
for index in range(len(op.output_names)):
if op.output_names[index] in output_key_filter[op.type]:
var_name_index = index
break
if var_name_index != -1:
var_name = op.output_arg_names[var_name_index]
else:
continue
if len(output_var_filter) > 0:
if var_name not in output_var_filter:
continue
# real_var_name = None
# if op.type == "fetch":
# for name in op.input_arg_names:
# real_var_name = name
# if "tmp" in name:
# break
# else:
# real_var_name = var_name
if fast_check:
if var_name not in fetch_names and var_name not in feed_names:
continue
try:
data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
sample = tensor_sample(data)
output_var_cache[var_name] = (sample)
op_cache[i] = (var_name, op)
file_name = var_name.replace("/", "_")
out_file = open(output_path + "/" + file_name, "w")
if var_name in feed_names:
for item in data:
out_file.write("{}\n".format(item))
else:
for item in sample:
out_file.write("{}\n".format(item))
out_file.close()
except:
pass
pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
restore_all_vars_persistable()
ops = prog.current_block().ops
vars = prog.current_block().vars
pp_yellow(dot + dot + " checking op list")
op_types = set()
for op in ops:
op_types.add(op.type)
pp_tab("op types : {}".format(op_types), 1)
def check_mobile_results(args, fuse, mem_opt):
args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
pp_green(args, 1)
res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net-feeds {}\"".format(mobile_exec_root, args))
lines = res.split("\n")
for line in lines:
print(line)
# for line in lines:
# if line.startswith("auto-test-debug"):
# print(line)
pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
mobile_var_cache = {}
for line in lines:
parts = line.split(" ")
if len(parts) < 2:
continue
if "auto-test" != parts[0]:
continue
if parts[1] == "load-time-cost":
pp_green("load time cost : {}".format(parts[2]), 1)
elif parts[1] == "predict-time-cost":
pp_green("predict time cost : {}".format(parts[2]), 1)
elif parts[1] == "preprocess-time-cost":
pp_green("preprocess time cost : {}".format(parts[2]), 1)
elif parts[1] == "var":
var_name = parts[2]
values = list(map(lambda x: float(x), parts[3:]))
mobile_var_cache[var_name] = values
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
fetch_diff = 0.0
fetch_count = 0
for index in op_cache:
op_output_var_name, op = op_cache[index]
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
fetch_diff += abs(v1 - v2)
fetch_count += 1
if fetch_count != 0:
pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
error_index = index
break
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if abs(v1 - v2) > diff_threshold:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("outputs are incorrect", 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
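# When neither fusion nor memory optimization is enabled, every op output is available on the
# device, so walk all ops again to locate the first op whose output diverges from fluid.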
if not fuse and not mem_opt:
pp_yellow("checking individual ops : ", 1)
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if fuse or mem_opt:
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
error_index = index
break
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if ((not math.isnan(v1)) and math.isnan(v2)) or abs(v1 - v2) > diff_threshold:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
error_index, op_cache[error_index][1].type, op_output_var_name), 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
# print(output_var_cache)
# print(mobile_var_cache)
def main():
# Load the feed key-value pairs
feed_kv = load_feed_kv()
if feed_kv == None:
feed_kv = gen_feed_kv()
save_feed_kv(feed_kv)
feed_kv = load_feed_kv()
# Run inference
pp_yellow(dot + dot + " checking inference")
outputs = run_model(feed_kv=feed_kv)
pp_tab("fluid output : {}".format(outputs), 1)
# Re-save the model
pp_yellow(dot + dot + " checking model correctness")
resave_model(feed_kv=feed_kv)
# Export the encrypted model
encrypt_model()
# Dump all intermediate results
pp_yellow(dot + dot + " checking output result of every op")
save_all_op_output(feed_kv=feed_kv)
pp_yellow(dot + dot + " checking fetch info")
for fetch in fetches:
fetch_name = fetch.name
fetch_shape = get_var_shape(fetch_name)
pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
# Dump info of all ops and vars
info_file = open("info.txt", "w")
for i in range(len(ops)):
op = ops[i]
info_file.write("{}th op: type - {}\n".format(i, op.type))
info_file.write("inputs:\n")
for var_name in op.input_arg_names:
try:
shape = get_var_shape(var_name)
shape_str = ", ".join(list(map(lambda x: str(x), shape)))
info_file.write("var {} : {}\n".format(var_name, shape_str))
except:
pass
info_file.write("outputs:\n")
for var_name in op.output_arg_names:
try:
shape = get_var_shape(var_name)
shape_str = ", ".join(list(map(lambda x: str(x), shape)))
info_file.write("var {} : {}\n".format(var_name, shape_str))
except:
pass
info_file.close()
# Start checking paddle mobile correctness
print("")
print("==================================================")
print("")
pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
push(checked_model_path)
pp_green(feed_names_, 1)
feed_names_argu = ""
for n in feed_names_:
feed_names_argu += "{}\n".format(n)
pp_green("feed name - {} ".format(str(n)), 1)
push(feed_path + "/" + str(n), "{}".format(str(n)))
push(feed_path + "/" + last_feed_file_name, "input.txt")
push(mobile_src_root + "/build/release/{}/build/libpaddle-mobile.so".format(architecture))
push(mobile_src_root + "/build/release/{}/build/cl_kernel".format(architecture))
push(mobile_src_root + "/test/build/test-net")
last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
args = str(len(last_feed_var_shape))
for dim in last_feed_var_shape:
args += " " + str(dim)
if is_lod:
args += " 1"
args += " " + str(len(last_feed_var_lod))
for dim in last_feed_var_lod:
args += " " + str(dim)
else:
args += " 0"
args += " " + str(len(output_var_cache))
args += " " + str(1 if is_sample_step else 0)
if is_sample_step:
args += " " + str(sample_step)
else:
args += " " + str(sample_num)
for var_name in output_var_cache.keys():
args += " " + var_name
args += " " + str(1 if check_shape else 0)
if not fast_check:
check_mobile_results(args, False, False)
check_mobile_results(args, False, True)
check_mobile_results(args, True, False)
check_mobile_results(args, True, True)
if __name__ == "__main__":
main()