未验证 提交 31150e99 编写于 作者: H HappyAngel 提交者: GitHub

[arm]add 2x2s2p1 pooling (#3705)

* fix pooling bug and speed

* add 2x2s2p1 pooling. test=develop

* fix conflict, test=develop
上级 49a60805
......@@ -206,6 +206,20 @@ void pooling_basic(const float* din,
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/
#define P2x2S2P1_MAX \
"ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \
"ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \
"sub %[dr0], %[dr0], #4\n" /* sub */ \
"sub %[dr1], %[dr1], #4\n" /* sub */ \
"fmax v4.4s, v0.4s, v6.4s\n" /* max */ \
"fmax v5.4s, v2.4s, v8.4s\n" /* max */ \
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \
"fmax v6.4s, v4.4s, v5.4s\n" /* max reduce */ \
"subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \
"st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"ble 2f\n" /* bne s3_max_loop_mid */
#define P2x2S2P0_MAX \
"1: \n" \
"fmax v4.4s, v0.4s, v1.4s\n" /* max */ \
......@@ -217,6 +231,21 @@ void pooling_basic(const float* din,
"st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"bne 1b\n" /* bne s3_max_loop_mid */
#define P2x2S2P1_AVG \
"ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \
"ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \
"sub %[dr0], %[dr0], #4\n" /* sub */ \
"sub %[dr1], %[dr1], #4\n" /* sub */ \
"fadd v4.4s, v0.4s, v6.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
"fadd v5.4s, v2.4s, v8.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \
"fadd v6.4s, v4.4s, v5.4s\n" /* add reduce */ \
"subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \
"fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */ \
"st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"ble 2f\n" /* bne s3_max_loop_mid */
#define P2x2S2P0_AVG \
"1: \n" /* load bias to q2, q3*/ \
"fadd v4.4s, v0.4s, v1.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
......@@ -228,6 +257,7 @@ void pooling_basic(const float* din,
"fmul v4.4s, v6.4s, %[vcoef].4s\n" /* mul coef */ \
"st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"bne 1b\n" /* bne s3_max_loop_mid */
#define P3x3S1_INIT \
"ldr q0, [%[dr0]], #16\n" /* load q0, dr0, 0-3*/ \
"ldr q1, [%[dr1]], #16\n" /* load q1, dr1, 0-3*/ \
......@@ -518,16 +548,45 @@ void pooling_basic(const float* din,
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n"
#define P2x2S2P1_MAX \
"vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \
"vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \
"sub %[dr0], #4 @sub \n" \
"sub %[dr1], #4 @sub \n" \
"vmax.f32 q8, q0, q4 @ max \n" \
"vmax.f32 q9, q2, q5 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vmax.f32 q5, q9, q8 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \
"vst1.f32 {d10-d11}, [%[dr_out]]! @ store 4 out \n" \
"ble 2f @ bne \n"
#define P2x2S2P0_MAX \
"1: @ main loop\n" \
"vmax.f32 q4, q0, q1 @ max \n" \
"vmax.f32 q5, q2, q3 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vmax.f32 q6, q4, q5 @ max reduce\n" \
"vmax.f32 q8, q4, q5 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \
"vst1.f32 {d12-d13}, [%[dr_out]]! @ store 4 out \n" \
"bne 1b @ bne "
"vst1.f32 {d16-d17}, [%[dr_out]]! @ store 4 out \n" \
"bne 1b @ bne \n"
#define P2x2S2P1_AVG \
"vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \
"vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \
"sub %[dr0], #4 @sub \n" \
"sub %[dr1], #4 @sub \n" \
"vadd.f32 q9, q0, q4 @ max \n" \
"vadd.f32 q8, q2, q5 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vadd.f32 q5, q9, q8 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \
"vmul.f32 q4, q5, %q[vcoef_left] @ mul coef \n" \
"vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \
"ble 2f @ bne\n"
#define P2x2S2P0_AVG \
"1: @ main loop\n" \
......@@ -535,9 +594,9 @@ void pooling_basic(const float* din,
"vadd.f32 q5, q2, q3 @ add 0, 2, 4, 6 \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load d0-d3 \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load d4-d7 \n" \
"vadd.f32 q6, q4, q5 @ add reduce \n" \
"vadd.f32 q8, q4, q5 @ add reduce \n" \
"subs %[cnt_num], #1 @ subs \n" \
"vmul.f32 q4, q6, %q[vcoef] @ mul coef \n" \
"vmul.f32 q4, q8, %q[vcoef] @ mul coef \n" \
"vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \
"bne 1b @ bne \n"
......@@ -1037,17 +1096,17 @@ void pooling1x1s2p0_max(const float* din,
TargetFree(TARGET(kARM), write_ptr);
}
void pooling2x2s2_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right) {
void pooling2x2s2p0_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
......@@ -1095,7 +1154,7 @@ void pooling2x2s2_max(const float* din,
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
#endif
dr0 -= 8;
dr1 -= 8;
......@@ -1121,18 +1180,18 @@ void pooling2x2s2_max(const float* din,
}
}
void pooling2x2s2_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right) {
void pooling2x2s2p0_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
......@@ -1158,12 +1217,14 @@ void pooling2x2s2_avg(const float* din,
const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel;
const float* r1 = r0 + win;
vcoef = vdupq_n_f32(0.25f);
for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel;
auto dr0 = r0;
auto dr1 = r1;
if (h * S + K - P > hin) {
dr1 = zero_ptr;
vcoef = vdupq_n_f32(0.5f);
}
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
......@@ -1184,7 +1245,7 @@ void pooling2x2s2_avg(const float* din,
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
#endif
dr0 -= 8;
dr1 -= 8;
......@@ -1194,8 +1255,14 @@ void pooling2x2s2_avg(const float* din,
int wstart = 0;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, rem);
float coef = 0.5f / (wend - wstart);
float coef = 0.25f;
float tmp = 0.f;
if (wend - wstart == 1 && pad_right == 0) {
coef *= 2;
}
if (h * S + K - P > hin && pad_bottom == 0) {
coef *= 2;
}
for (int i = wstart; i < wend; i++) {
tmp += dr0[i] + dr1[i];
}
......@@ -1212,6 +1279,235 @@ void pooling2x2s2_avg(const float* din,
TargetFree(TARGET(kARM), zero_ptr);
}
void pooling2x2s2p1_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
auto data_in = static_cast<const float*>(din);
const int K = 2;
const int P = 1;
const int S = 2;
int w_unroll_size = wout / 4;
int w_unroll_remian = wout - w_unroll_size * 4;
float32x4_t vzero = vdupq_n_f32(std::numeric_limits<float>::lowest());
if (w_unroll_remian == 0) {
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; c++) {
float* data_out_channel = data_out_batch + c * size_channel_out;
const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel;
const float* r1 = r0 + win;
for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel;
auto dr0 = r0;
auto dr1 = r1;
if (h == 0) {
dr0 = r0;
dr1 = r0;
r0 = r1;
r1 = r0 + win;
} else {
r0 = r1 + win;
r1 = r0 + win;
}
if (h * S + K - P > hin) {
dr1 = dr0;
if (h * S + K - P > hin + 1) {
memset(dr_out, 0, wout * sizeof(float));
continue;
}
}
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
asm volatile(
P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vzero] "w"(vzero)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
#else
asm volatile(
P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vzero] "w"(vzero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
#endif
dr0 -= 8;
dr1 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = wend == st ? 0.f : dr0[0];
for (int i = 0; i < wend - st; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
}
*(dr_out++) = tmp;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
wstart += S;
}
data_out_channel += wout;
}
}
}
}
void pooling2x2s2p1_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
auto data_in = static_cast<const float*>(din);
const int K = 2;
const int P = 1;
const int S = 2;
int w_unroll_size = wout / 4;
int w_unroll_remian = wout - w_unroll_size * 4;
auto zero_ptr =
static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
float32x4_t vzero = vdupq_n_f32(0.f);
memset(zero_ptr, 0, win * sizeof(float));
if (w_unroll_remian == 0) {
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; c++) {
float* data_out_channel = data_out_batch + c * size_channel_out;
const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel;
const float* r1 = r0 + win;
for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel;
auto dr0 = r0;
auto dr1 = r1;
float coef_h = 0.5f;
if (h == 0) {
dr0 = zero_ptr;
dr1 = r0;
r0 = r1;
r1 = r0 + win;
if (exclusive) {
coef_h = 1.f;
}
} else {
r0 = r1 + win;
r1 = r0 + win;
}
if (h * S + K - P > hin) {
dr1 = zero_ptr;
if (exclusive) {
coef_h = 1.f;
}
if (h * S + K - P > hin + 1) {
memset(dr_out, 0, wout * sizeof(float));
continue;
}
}
float coef_left_most = exclusive ? coef_h : coef_h / 2;
float32x4_t vcoef = vdupq_n_f32(coef_h / 2);
float coef_left[4] = {
coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
float32x4_t vcoef_left = vld1q_f32(coef_left);
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
asm volatile(
P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef),
[vzero] "w"(vzero),
[vcoef_left] "w"(vcoef_left)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
#else
asm volatile(
P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef),
[vzero] "w"(vzero),
[vcoef_left] "w"(vcoef_left)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
#endif
dr0 -= 8;
dr1 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = 0.f;
float coef = coef_h / 2;
if (exclusive && wend - st == 1) {
coef = coef_h;
}
for (int i = 0; i < wend - st; i++) {
tmp += dr0[i] + dr1[i];
}
*(dr_out++) = tmp * coef;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
wstart += S;
}
data_out_channel += wout;
}
}
}
TargetFree(TARGET(kARM), zero_ptr);
}
void pooling3x3s1p1_max(const float* din,
float* dout,
int num,
......@@ -2240,6 +2536,9 @@ void pooling3x3s2p0_max(const float* din,
w_unroll_remian = wout - w_unroll_size * 4;
}
int remain = w_unroll_remian - 1;
int right = wout * 2 + 1 - win; // if need right pad
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
......@@ -2266,6 +2565,7 @@ void pooling3x3s2p0_max(const float* din,
}
}
int cnt_num = w_unroll_size;
int cnt_remain = remain;
if (w_unroll_size > 0) {
#ifdef __aarch64__
asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
......@@ -2289,46 +2589,80 @@ void pooling3x3s2p0_max(const float* din,
"v9",
"v10",
"v11");
#else
asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr2] "+r"(dr2),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
:
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
#endif
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
}
// deal with right pad
int rem = win - (w_unroll_size * 4) * S;
int wstart = 0;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, rem);
float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
for (int i = wstart; i < wend; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
int rem = win - (w_unroll_size * 4) * S;
int wstart = 0;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, rem);
float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
for (int i = wstart; i < wend; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
wstart += S;
}
*(dr_out++) = tmp;
wstart += S;
#else
asm volatile(
P3x3S2P0_INIT P3x3S2P0_MAX
"cmp %[remain], #0 @cmp cnt_num\n"
"sub %[dr0], #32 @sub - 8\n"
"sub %[dr1], #32 @sub - 8\n"
"sub %[dr2], #32 @sub - 8\n"
"ble 4f @ble exit1\n"
"2: @mid loop\n"
"vld1.f32 {d0-d1}, [%[dr0]]! @load \n"
"vld1.f32 {d2-d3}, [%[dr1]]! @load \n"
"vld1.f32 {d4-d5}, [%[dr2]]! @load \n"
"vmov.f32 s3,s2 @mov \n"
"vmov.f32 s7,s6 @mov \n"
"vmov.f32 s11,s10 @mov \n"
"vmax.f32 q0, q0, q1 @max n"
"sub %[dr0], #8 @add w \n"
"sub %[dr1], #8 @add w \n"
"sub %[dr2], #8 @add w \n"
"vmax.f32 q0, q0, q2 @max \n"
"vpmax.f32 d0, d0, d1 @pmax \n"
"vpmax.f32 d0, d0, d0 @pmax \n"
"subs %[remain], #1 @subs \n"
"vst1.f32 d0[0], [%[dr_out]]! @vst \n"
"bne 2b @bne \n"
"4: @exit\n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr2] "+r"(dr2),
[dr_out] "+r"(dr_out),
[remain] "+r"(cnt_remain),
[cnt_num] "+r"(cnt_num)
:
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
if (right) {
int wstart = (w_unroll_size * 4 + remain) * S;
int wend = std::min(wstart + K, win);
float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
for (int i = wstart; i < wend; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
}
#endif
}
r0 = r2;
......
......@@ -76,30 +76,55 @@ void pooling1x1s2p0_max(const float* din,
int pad_bottom,
int pad_right);
void pooling2x2s2_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right);
void pooling2x2s2_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right);
void pooling2x2s2p0_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right);
void pooling2x2s2p0_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right);
void pooling2x2s2p1_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right);
void pooling2x2s2p1_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right);
void pooling3x3s1p1_max(const float* din,
float* dout,
......
......@@ -50,13 +50,14 @@ class PoolingPE : public PE {
PoolingArgs args = {0};
args.mode = param_.type;
auto paddings = *param_.paddings;
args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
args.image.address = input->data<float16>();
args.image.channels = input->shape().channel();
args.image.height = input->shape().height();
args.image.width = input->shape().width();
args.image.pad_height = param_.paddings[0];
args.image.pad_width = param_.paddings[1];
args.image.pad_height = paddings[0];
args.image.pad_width = paddings[2];
args.image.scale_address = input->scale();
args.output.address = output->mutableData<float16>();
args.output.scale_address = output->scale();
......@@ -69,8 +70,7 @@ class PoolingPE : public PE {
param_.poolingArgs = args;
// use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
// &&
// (k_width > 7 || k_height > 7);
// && (k_width > 7 || k_height > 7);
use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
(k_width > 255 || k_height > 255);
// use_cpu_ = param_.type == AVERAGE;
......@@ -86,12 +86,13 @@ class PoolingPE : public PE {
float* image_addr = float_input.mutableData<float>(FP32, input->shape());
float_input.copyFrom(input);
float16* data_out = output->data<float16>();
auto paddings = *param_.paddings;
int image_height = input->shape().height();
int image_width = input->shape().width();
int image_channels = input->shape().channel();
int image_pad_h = param_.paddings[0];
int image_pad_w = param_.paddings[1];
int image_pad_h = paddings[0];
int image_pad_w = paddings[2];
int kernel_height = param_.kernelSize[1];
int kernel_width = param_.kernelSize[0];
int kernel_step_h = param_.strides[0];
......
......@@ -58,6 +58,7 @@ void PoolCompute::Run() {
bool global_pooling = (paddings[0] == 0) && (ksize[0] == in_dims[2]) &&
(ksize[1] == in_dims[3]) && kps_equal && pads_equal;
global_pooling = param.global_pooling || global_pooling;
if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
......@@ -107,35 +108,65 @@ void PoolCompute::Run() {
} else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 &&
kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling2x2s2_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
paddings[1],
paddings[3]);
lite::arm::math::pooling2x2s2p0_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
paddings[1],
paddings[3]);
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling2x2s2_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive,
paddings[1],
paddings[3]);
lite::arm::math::pooling2x2s2p0_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive,
paddings[1],
paddings[3]);
return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
} else if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 1 &&
kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling2x2s2p1_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
paddings[1],
paddings[3]);
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling2x2s2p1_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive,
paddings[1],
paddings[3]);
return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
pads_equal && kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p1_max(din,
dout,
......@@ -165,7 +196,7 @@ void PoolCompute::Run() {
return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
kps_equal) {
pads_equal && kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din,
dout,
......@@ -195,7 +226,7 @@ void PoolCompute::Run() {
return;
}
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
kps_equal) {
pads_equal && kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p0_max(din,
dout,
......@@ -225,7 +256,7 @@ void PoolCompute::Run() {
return;
}
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
kps_equal) {
pads_equal && kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p1_max(din,
dout,
......
......@@ -678,15 +678,9 @@ void resize(const uint8_t* src,
} else if (srcFormat == NV12 || srcFormat == NV21) {
nv21_resize(src, dst, srcw, srch, dstw, dsth);
return;
num = 1;
int hout = static_cast<int>(0.5 * dsth);
dsth += hout;
} else if (srcFormat == BGR || srcFormat == RGB) {
bgr_resize(src, dst, srcw, srch, dstw, dsth);
return;
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册