提交 b7758556 编写于 作者: H HappyAngel 提交者: GitHub

[arm] fix conv_dw leakyRelu compute error (#2814)

[arm] fix conv_dw leakyRelu compute error
上级 42bbd157
...@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ "fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \ "fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \ "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
\ \
"fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \
...@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\ \
"vbif q4, q6, q15 @ choose \n" \ "vbif q4, q6, q15 @ choose \n" \
"vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \
"vmul.f32 q6, q4, q14 \n" \ "vmul.f32 q6, q5, q14 \n" \
"vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \
"vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \
\ \
...@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"vmul.f32 q12, q14, q9 \n" \ "vmul.f32 q12, q14, q9 \n" \
"vmul.f32 q13, q15, q9 \n" \ "vmul.f32 q13, q15, q9 \n" \
\ \
"vbif q14, q10, q12 \n" \ "vbif q14, q12, q10 \n" \
"vbif q15, q11, q13 \n" \ "vbif q15, q13, q11 \n" \
\ \
"vst1.32 {d28-d29}, [%[out1]]\n" \ "vst1.32 {d28-d29}, [%[out1]]\n" \
"vst1.32 {d30-d31}, [%[out2]]\n" "vst1.32 {d30-d31}, [%[out2]]\n"
......
...@@ -209,9 +209,9 @@ namespace math { ...@@ -209,9 +209,9 @@ namespace math {
"fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \ "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \ "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \ "bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \ "bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \
"bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \ "bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \
"bif v19.16b, v8.16b, v7.16b \n" /* choose*/ "bif v22.16b, v8.16b, v7.16b \n" /* choose*/
#define STORE /* save result */ \ #define STORE /* save result */ \
"str q19, [%[outc0]], #16\n" \ "str q19, [%[outc0]], #16\n" \
"str q20, [%[outc1]], #16\n" \ "str q20, [%[outc1]], #16\n" \
......
...@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims, ...@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
const float leakey_relu_scale) {} const float leakey_relu_scale) {}
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
// TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv // TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
#if 0 /// 3x3dw #if 1 /// 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) { TEST(TestConv3x3DW, test_conv3x3_depthwise) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) { for (auto& stride : {1, 2}) {
...@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) { ...@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
{1, 1}, {1, 1},
flag_bias, flag_bias,
flag_act, flag_act,
{1, 2, 4}, {1},
{FLAGS_power_mode}, {FLAGS_power_mode},
leakey_relu_scale); leakey_relu_scale);
} }
...@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) { ...@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
#if 1 /// 5x5dw #if 1 /// 5x5dw
TEST(TestConv5x5DW, test_conv5x5_depthwise) { TEST(TestConv5x5DW, test_conv5x5_depthwise) {
if (FLAGS_basic_test) { if (FLAGS_basic_test) {
#ifdef __aarch64__
// TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
for (auto& stride : {1}) {
#else
for (auto& stride : {1, 2}) { for (auto& stride : {1, 2}) {
#endif
for (auto& pad_left : {0, 1, 2}) { for (auto& pad_left : {0, 1, 2}) {
for (auto& pad_right : {0, 1, 2}) { for (auto& pad_right : {0, 1, 2}) {
for (auto& pad_top : {0, 1, 2}) { for (auto& pad_top : {0, 1, 2}) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册