Fix depthwise conv5x5 bugs for armv8

c02076f0 · hjchen2 · e7889e75 · c02076f0
隐藏空白更改
内联并排

Showing 1 changed file with 4 additions and 6 deletions

src/operators/math/depthwise_conv5x5.cpp src/operators/math/depthwise_conv5x5.cpp +4 -6

未找到文件。
--- a/src/operators/math/depthwise_conv5x5.cpp
+++ b/src/operators/math/depthwise_conv5x5.cpp
@@ -244,10 +244,9 @@ void DepthwiseConv5x5S1<float, float>(const framework::Tensor &input,
        output_ptr1 += valid_w_start;
      }
        // valid
-// #if __aarch64__
-#if 0
+#if __aarch64__
      float32x4_t _q14, _q15;
-      for (int loop = 0; loop = output_w_tiles; ++loop) {
+      for (int loop = 0; loop < output_w_tiles; ++loop) {
        float32x4_t _q7 = vld1q_f32(input_ptr0);
        float32x4_t _q8 = vld1q_f32(input_ptr0 + 4);
        float32x4_t _q9 = vld1q_f32(input_ptr1);
@@ -759,10 +758,9 @@ void DepthwiseConv5x5S1<float, float>(const framework::Tensor &input,
        output_ptr0 += valid_w_start;
      }
        // valid
-// #if __aarch64__
-#if 0
+#if __aarch64__
      float32x4_t _q14;
-      for (int loop = 0; loop = output_w_tiles; ++loop) {
+      for (int loop = 0; loop < output_w_tiles; ++loop) {
        float32x4_t _q7 = vld1q_f32(input_ptr0);
        float32x4_t _q8 = vld1q_f32(input_ptr0 + 4);
        float32x4_t _q9 = vld1q_f32(input_ptr1);