提交 2b32484a 编写于 作者: H hanbuhe

merge ..

...@@ -206,6 +206,20 @@ void pooling_basic(const float* din, ...@@ -206,6 +206,20 @@ void pooling_basic(const float* din,
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \ "ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ "ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/
#define P2x2S2P1_MAX \
"ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \
"ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \
"sub %[dr0], %[dr0], #4\n" /* sub */ \
"sub %[dr1], %[dr1], #4\n" /* sub */ \
"fmax v4.4s, v0.4s, v6.4s\n" /* max */ \
"fmax v5.4s, v2.4s, v8.4s\n" /* max */ \
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \
"fmax v6.4s, v4.4s, v5.4s\n" /* max reduce */ \
"subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \
"st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"ble 2f\n" /* bne s3_max_loop_mid */
#define P2x2S2P0_MAX \ #define P2x2S2P0_MAX \
"1: \n" \ "1: \n" \
"fmax v4.4s, v0.4s, v1.4s\n" /* max */ \ "fmax v4.4s, v0.4s, v1.4s\n" /* max */ \
...@@ -217,6 +231,21 @@ void pooling_basic(const float* din, ...@@ -217,6 +231,21 @@ void pooling_basic(const float* din,
"st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ "st1 {v6.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"bne 1b\n" /* bne s3_max_loop_mid */ "bne 1b\n" /* bne s3_max_loop_mid */
#define P2x2S2P1_AVG \
"ext v6.16b, %[vzero].16b, v1.16b, #12\n" /* 1357-0135 */ \
"ext v8.16b, %[vzero].16b, v3.16b, #12\n" /* 1357-0135 */ \
"sub %[dr0], %[dr0], #4\n" /* sub */ \
"sub %[dr1], %[dr1], #4\n" /* sub */ \
"fadd v4.4s, v0.4s, v6.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
"fadd v5.4s, v2.4s, v8.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
"ld2 {v0.4s, v1.4s}, [%[dr0]], #32\n" /* load q0-q1, dr0, 0-7*/ \
"ld2 {v2.4s, v3.4s}, [%[dr1]], #32\n" /* load q2-q3, dr1, 0-7*/ \
"fadd v6.4s, v4.4s, v5.4s\n" /* add reduce */ \
"subs %w[cnt_num], %w[cnt_num], #1\n" /* subs cnt_num, #1*/ \
"fmul v4.4s, v6.4s, %[vcoef_left].4s\n" /* mul coef */ \
"st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"ble 2f\n" /* bne s3_max_loop_mid */
#define P2x2S2P0_AVG \ #define P2x2S2P0_AVG \
"1: \n" /* load bias to q2, q3*/ \ "1: \n" /* load bias to q2, q3*/ \
"fadd v4.4s, v0.4s, v1.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \ "fadd v4.4s, v0.4s, v1.4s\n" /* add 0, 2, 4, 6 and 1, 3, 5, 7 */ \
...@@ -228,6 +257,7 @@ void pooling_basic(const float* din, ...@@ -228,6 +257,7 @@ void pooling_basic(const float* din,
"fmul v4.4s, v6.4s, %[vcoef].4s\n" /* mul coef */ \ "fmul v4.4s, v6.4s, %[vcoef].4s\n" /* mul coef */ \
"st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \ "st1 {v4.4s}, [%[dr_out]], #16\n" /* store 4 out, dr_out */ \
"bne 1b\n" /* bne s3_max_loop_mid */ "bne 1b\n" /* bne s3_max_loop_mid */
#define P3x3S1_INIT \ #define P3x3S1_INIT \
"ldr q0, [%[dr0]], #16\n" /* load q0, dr0, 0-3*/ \ "ldr q0, [%[dr0]], #16\n" /* load q0, dr0, 0-3*/ \
"ldr q1, [%[dr1]], #16\n" /* load q1, dr1, 0-3*/ \ "ldr q1, [%[dr1]], #16\n" /* load q1, dr1, 0-3*/ \
...@@ -518,16 +548,45 @@ void pooling_basic(const float* din, ...@@ -518,16 +548,45 @@ void pooling_basic(const float* din,
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \ "vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" "vld2.f32 {d4-d7}, [%[dr1]]! @ load \n"
#define P2x2S2P1_MAX \
"vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \
"vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \
"sub %[dr0], #4 @sub \n" \
"sub %[dr1], #4 @sub \n" \
"vmax.f32 q8, q0, q4 @ max \n" \
"vmax.f32 q9, q2, q5 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vmax.f32 q5, q9, q8 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \
"vst1.f32 {d10-d11}, [%[dr_out]]! @ store 4 out \n" \
"ble 2f @ bne \n"
#define P2x2S2P0_MAX \ #define P2x2S2P0_MAX \
"1: @ main loop\n" \ "1: @ main loop\n" \
"vmax.f32 q4, q0, q1 @ max \n" \ "vmax.f32 q4, q0, q1 @ max \n" \
"vmax.f32 q5, q2, q3 @ max \n" \ "vmax.f32 q5, q2, q3 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \ "vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \ "vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vmax.f32 q6, q4, q5 @ max reduce\n" \ "vmax.f32 q8, q4, q5 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \ "subs %[cnt_num], #1 @ subs cnt_num \n" \
"vst1.f32 {d12-d13}, [%[dr_out]]! @ store 4 out \n" \ "vst1.f32 {d16-d17}, [%[dr_out]]! @ store 4 out \n" \
"bne 1b @ bne " "bne 1b @ bne \n"
#define P2x2S2P1_AVG \
"vext.32 q4, %q[vzero], q1, #3 @ 1357-0135\n" \
"vext.32 q5, %q[vzero], q3, #3 @ 1357-0135\n" \
"sub %[dr0], #4 @sub \n" \
"sub %[dr1], #4 @sub \n" \
"vadd.f32 q9, q0, q4 @ max \n" \
"vadd.f32 q8, q2, q5 @ max \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load \n" \
"vadd.f32 q5, q9, q8 @ max reduce\n" \
"subs %[cnt_num], #1 @ subs cnt_num \n" \
"vmul.f32 q4, q5, %q[vcoef_left] @ mul coef \n" \
"vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \
"ble 2f @ bne\n"
#define P2x2S2P0_AVG \ #define P2x2S2P0_AVG \
"1: @ main loop\n" \ "1: @ main loop\n" \
...@@ -535,9 +594,9 @@ void pooling_basic(const float* din, ...@@ -535,9 +594,9 @@ void pooling_basic(const float* din,
"vadd.f32 q5, q2, q3 @ add 0, 2, 4, 6 \n" \ "vadd.f32 q5, q2, q3 @ add 0, 2, 4, 6 \n" \
"vld2.f32 {d0-d3}, [%[dr0]]! @ load d0-d3 \n" \ "vld2.f32 {d0-d3}, [%[dr0]]! @ load d0-d3 \n" \
"vld2.f32 {d4-d7}, [%[dr1]]! @ load d4-d7 \n" \ "vld2.f32 {d4-d7}, [%[dr1]]! @ load d4-d7 \n" \
"vadd.f32 q6, q4, q5 @ add reduce \n" \ "vadd.f32 q8, q4, q5 @ add reduce \n" \
"subs %[cnt_num], #1 @ subs \n" \ "subs %[cnt_num], #1 @ subs \n" \
"vmul.f32 q4, q6, %q[vcoef] @ mul coef \n" \ "vmul.f32 q4, q8, %q[vcoef] @ mul coef \n" \
"vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \ "vst1.f32 {d8-d9}, [%[dr_out]]! @ store 4 out \n" \
"bne 1b @ bne \n" "bne 1b @ bne \n"
...@@ -1037,17 +1096,17 @@ void pooling1x1s2p0_max(const float* din, ...@@ -1037,17 +1096,17 @@ void pooling1x1s2p0_max(const float* din,
TargetFree(TARGET(kARM), write_ptr); TargetFree(TARGET(kARM), write_ptr);
} }
void pooling2x2s2_max(const float* din, void pooling2x2s2p0_max(const float* din,
float* dout, float* dout,
int num, int num,
int chout, int chout,
int hout, int hout,
int wout, int wout,
int chin, int chin,
int hin, int hin,
int win, int win,
int pad_bottom, int pad_bottom,
int pad_right) { int pad_right) {
int size_channel_out = wout * hout; int size_channel_out = wout * hout;
int size_channel_in = win * hin; int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout); auto data_out = static_cast<float*>(dout);
...@@ -1095,7 +1154,7 @@ void pooling2x2s2_max(const float* din, ...@@ -1095,7 +1154,7 @@ void pooling2x2s2_max(const float* din,
[dr_out] "+r"(dr_out), [dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num) [cnt_num] "+r"(cnt_num)
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
#endif #endif
dr0 -= 8; dr0 -= 8;
dr1 -= 8; dr1 -= 8;
...@@ -1121,18 +1180,18 @@ void pooling2x2s2_max(const float* din, ...@@ -1121,18 +1180,18 @@ void pooling2x2s2_max(const float* din,
} }
} }
void pooling2x2s2_avg(const float* din, void pooling2x2s2p0_avg(const float* din,
float* dout, float* dout,
int num, int num,
int chout, int chout,
int hout, int hout,
int wout, int wout,
int chin, int chin,
int hin, int hin,
int win, int win,
bool exclusive, bool exclusive,
int pad_bottom, int pad_bottom,
int pad_right) { int pad_right) {
int size_channel_out = wout * hout; int size_channel_out = wout * hout;
int size_channel_in = win * hin; int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout); auto data_out = static_cast<float*>(dout);
...@@ -1158,12 +1217,14 @@ void pooling2x2s2_avg(const float* din, ...@@ -1158,12 +1217,14 @@ void pooling2x2s2_avg(const float* din,
const float* data_in_channel = data_in_batch + c * size_channel_in; const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel; const float* r0 = data_in_channel;
const float* r1 = r0 + win; const float* r1 = r0 + win;
vcoef = vdupq_n_f32(0.25f);
for (int h = 0; h < hout; h++) { for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel; float* dr_out = data_out_channel;
auto dr0 = r0; auto dr0 = r0;
auto dr1 = r1; auto dr1 = r1;
if (h * S + K - P > hin) { if (h * S + K - P > hin) {
dr1 = zero_ptr; dr1 = zero_ptr;
vcoef = vdupq_n_f32(0.5f);
} }
int cnt_num = w_unroll_size; int cnt_num = w_unroll_size;
if (w_unroll_size > 0) { if (w_unroll_size > 0) {
...@@ -1184,7 +1245,7 @@ void pooling2x2s2_avg(const float* din, ...@@ -1184,7 +1245,7 @@ void pooling2x2s2_avg(const float* din,
[dr_out] "+r"(dr_out), [dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num) [cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef) : [vcoef] "w"(vcoef)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8");
#endif #endif
dr0 -= 8; dr0 -= 8;
dr1 -= 8; dr1 -= 8;
...@@ -1194,8 +1255,14 @@ void pooling2x2s2_avg(const float* din, ...@@ -1194,8 +1255,14 @@ void pooling2x2s2_avg(const float* din,
int wstart = 0; int wstart = 0;
for (int j = 0; j < w_unroll_remian; ++j) { for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, rem); int wend = std::min(wstart + K, rem);
float coef = 0.5f / (wend - wstart); float coef = 0.25f;
float tmp = 0.f; float tmp = 0.f;
if (wend - wstart == 1 && pad_right == 0) {
coef *= 2;
}
if (h * S + K - P > hin && pad_bottom == 0) {
coef *= 2;
}
for (int i = wstart; i < wend; i++) { for (int i = wstart; i < wend; i++) {
tmp += dr0[i] + dr1[i]; tmp += dr0[i] + dr1[i];
} }
...@@ -1212,6 +1279,235 @@ void pooling2x2s2_avg(const float* din, ...@@ -1212,6 +1279,235 @@ void pooling2x2s2_avg(const float* din,
TargetFree(TARGET(kARM), zero_ptr); TargetFree(TARGET(kARM), zero_ptr);
} }
void pooling2x2s2p1_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
auto data_in = static_cast<const float*>(din);
const int K = 2;
const int P = 1;
const int S = 2;
int w_unroll_size = wout / 4;
int w_unroll_remian = wout - w_unroll_size * 4;
float32x4_t vzero = vdupq_n_f32(std::numeric_limits<float>::lowest());
if (w_unroll_remian == 0) {
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; c++) {
float* data_out_channel = data_out_batch + c * size_channel_out;
const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel;
const float* r1 = r0 + win;
for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel;
auto dr0 = r0;
auto dr1 = r1;
if (h == 0) {
dr0 = r0;
dr1 = r0;
r0 = r1;
r1 = r0 + win;
} else {
r0 = r1 + win;
r1 = r0 + win;
}
if (h * S + K - P > hin) {
dr1 = dr0;
if (h * S + K - P > hin + 1) {
memset(dr_out, 0, wout * sizeof(float));
continue;
}
}
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
asm volatile(
P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vzero] "w"(vzero)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
#else
asm volatile(
P2x2S2_INIT P2x2S2P1_MAX P2x2S2P0_MAX "2: \n" /* end */
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vzero] "w"(vzero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
#endif
dr0 -= 8;
dr1 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = wend == st ? 0.f : dr0[0];
for (int i = 0; i < wend - st; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
}
*(dr_out++) = tmp;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
wstart += S;
}
data_out_channel += wout;
}
}
}
}
void pooling2x2s2p1_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive,
int pad_bottom,
int pad_right) {
int size_channel_out = wout * hout;
int size_channel_in = win * hin;
auto data_out = static_cast<float*>(dout);
auto data_in = static_cast<const float*>(din);
const int K = 2;
const int P = 1;
const int S = 2;
int w_unroll_size = wout / 4;
int w_unroll_remian = wout - w_unroll_size * 4;
auto zero_ptr =
static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
float32x4_t vzero = vdupq_n_f32(0.f);
memset(zero_ptr, 0, win * sizeof(float));
if (w_unroll_remian == 0) {
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; c++) {
float* data_out_channel = data_out_batch + c * size_channel_out;
const float* data_in_channel = data_in_batch + c * size_channel_in;
const float* r0 = data_in_channel;
const float* r1 = r0 + win;
for (int h = 0; h < hout; h++) {
float* dr_out = data_out_channel;
auto dr0 = r0;
auto dr1 = r1;
float coef_h = 0.5f;
if (h == 0) {
dr0 = zero_ptr;
dr1 = r0;
r0 = r1;
r1 = r0 + win;
if (exclusive) {
coef_h = 1.f;
}
} else {
r0 = r1 + win;
r1 = r0 + win;
}
if (h * S + K - P > hin) {
dr1 = zero_ptr;
if (exclusive) {
coef_h = 1.f;
}
if (h * S + K - P > hin + 1) {
memset(dr_out, 0, wout * sizeof(float));
continue;
}
}
float coef_left_most = exclusive ? coef_h : coef_h / 2;
float32x4_t vcoef = vdupq_n_f32(coef_h / 2);
float coef_left[4] = {
coef_left_most, coef_h / 2, coef_h / 2, coef_h / 2};
float32x4_t vcoef_left = vld1q_f32(coef_left);
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
asm volatile(
P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef),
[vzero] "w"(vzero),
[vcoef_left] "w"(vcoef_left)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8");
#else
asm volatile(
P2x2S2_INIT P2x2S2P1_AVG P2x2S2P0_AVG "2: \n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
: [vcoef] "w"(vcoef),
[vzero] "w"(vzero),
[vcoef_left] "w"(vcoef_left)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9");
#endif
dr0 -= 8;
dr1 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = 0.f;
float coef = coef_h / 2;
if (exclusive && wend - st == 1) {
coef = coef_h;
}
for (int i = 0; i < wend - st; i++) {
tmp += dr0[i] + dr1[i];
}
*(dr_out++) = tmp * coef;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
wstart += S;
}
data_out_channel += wout;
}
}
}
TargetFree(TARGET(kARM), zero_ptr);
}
void pooling3x3s1p1_max(const float* din, void pooling3x3s1p1_max(const float* din,
float* dout, float* dout,
int num, int num,
...@@ -2240,6 +2536,9 @@ void pooling3x3s2p0_max(const float* din, ...@@ -2240,6 +2536,9 @@ void pooling3x3s2p0_max(const float* din,
w_unroll_remian = wout - w_unroll_size * 4; w_unroll_remian = wout - w_unroll_size * 4;
} }
int remain = w_unroll_remian - 1;
int right = wout * 2 + 1 - win; // if need right pad
for (int n = 0; n < num; ++n) { for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out; float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in; const float* data_in_batch = data_in + n * chin * size_channel_in;
...@@ -2266,6 +2565,7 @@ void pooling3x3s2p0_max(const float* din, ...@@ -2266,6 +2565,7 @@ void pooling3x3s2p0_max(const float* din,
} }
} }
int cnt_num = w_unroll_size; int cnt_num = w_unroll_size;
int cnt_remain = remain;
if (w_unroll_size > 0) { if (w_unroll_size > 0) {
#ifdef __aarch64__ #ifdef __aarch64__
asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
...@@ -2289,46 +2589,80 @@ void pooling3x3s2p0_max(const float* din, ...@@ -2289,46 +2589,80 @@ void pooling3x3s2p0_max(const float* din,
"v9", "v9",
"v10", "v10",
"v11"); "v11");
#else
asm volatile(P3x3S2P0_INIT P3x3S2P0_MAX
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr2] "+r"(dr2),
[dr_out] "+r"(dr_out),
[cnt_num] "+r"(cnt_num)
:
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
#endif
dr0 -= 8; dr0 -= 8;
dr1 -= 8; dr1 -= 8;
dr2 -= 8; dr2 -= 8;
} int rem = win - (w_unroll_size * 4) * S;
// deal with right pad int wstart = 0;
int rem = win - (w_unroll_size * 4) * S; for (int j = 0; j < w_unroll_remian; ++j) {
int wstart = 0; int wend = std::min(wstart + K, rem);
for (int j = 0; j < w_unroll_remian; ++j) { float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
int wend = std::min(wstart + K, rem); for (int i = wstart; i < wend; i++) {
float tmp = dr0[wstart]; // std::numeric_limits<float>::min(); tmp = std::max(tmp, dr0[i]);
for (int i = wstart; i < wend; i++) { tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr0[i]); tmp = std::max(tmp, dr2[i]);
tmp = std::max(tmp, dr1[i]); }
tmp = std::max(tmp, dr2[i]); *(dr_out++) = tmp;
wstart += S;
} }
*(dr_out++) = tmp; #else
wstart += S; asm volatile(
P3x3S2P0_INIT P3x3S2P0_MAX
"cmp %[remain], #0 @cmp cnt_num\n"
"sub %[dr0], #32 @sub - 8\n"
"sub %[dr1], #32 @sub - 8\n"
"sub %[dr2], #32 @sub - 8\n"
"ble 4f @ble exit1\n"
"2: @mid loop\n"
"vld1.f32 {d0-d1}, [%[dr0]]! @load \n"
"vld1.f32 {d2-d3}, [%[dr1]]! @load \n"
"vld1.f32 {d4-d5}, [%[dr2]]! @load \n"
"vmov.f32 s3,s2 @mov \n"
"vmov.f32 s7,s6 @mov \n"
"vmov.f32 s11,s10 @mov \n"
"vmax.f32 q0, q0, q1 @max n"
"sub %[dr0], #8 @add w \n"
"sub %[dr1], #8 @add w \n"
"sub %[dr2], #8 @add w \n"
"vmax.f32 q0, q0, q2 @max \n"
"vpmax.f32 d0, d0, d1 @pmax \n"
"vpmax.f32 d0, d0, d0 @pmax \n"
"subs %[remain], #1 @subs \n"
"vst1.f32 d0[0], [%[dr_out]]! @vst \n"
"bne 2b @bne \n"
"4: @exit\n"
: [dr0] "+r"(dr0),
[dr1] "+r"(dr1),
[dr2] "+r"(dr2),
[dr_out] "+r"(dr_out),
[remain] "+r"(cnt_remain),
[cnt_num] "+r"(cnt_num)
:
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11");
if (right) {
int wstart = (w_unroll_size * 4 + remain) * S;
int wend = std::min(wstart + K, win);
float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
for (int i = wstart; i < wend; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
}
#endif
} }
r0 = r2; r0 = r2;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void sigmoid(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
const int index = get_global_id(0);
if (index < count) {
out_data[index] = 1 / (1 + exp(-x_data[index]));
}
}
...@@ -42,7 +42,6 @@ void FcFuser::BuildPattern() { ...@@ -42,7 +42,6 @@ void FcFuser::BuildPattern() {
add->AsIntermediate(); add->AsIntermediate();
if (act_type_ != "") { if (act_type_ != "") {
std::cout << "act_type_:" << act_type_ << std::endl;
auto* add_out = VarNode("add_out"); auto* add_out = VarNode("add_out");
auto* activation = OpNode(act_type_, act_type_); auto* activation = OpNode(act_type_, act_type_);
std::vector<PMNode*> act_inputs{add_out}; std::vector<PMNode*> act_inputs{add_out};
...@@ -51,7 +50,6 @@ void FcFuser::BuildPattern() { ...@@ -51,7 +50,6 @@ void FcFuser::BuildPattern() {
add_out->AsIntermediate(); add_out->AsIntermediate();
activation->AsIntermediate(); activation->AsIntermediate();
} else { } else {
std::cout << "act_type_: empty" << std::endl;
add_inputs >> *add >> *Out; add_inputs >> *add >> *Out;
} }
} }
......
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册