diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc index 0495cf9315d7319947ece3aa9f152fb50f16c239..6df0c7badfee33aadbc385068bd1f781a63ab2b3 100644 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ b/mace/ops/arm/deconv_2d_neon_3x3.cc @@ -319,7 +319,7 @@ void Deconv2dNeonK3x3S2(const float *input, index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -365,6 +365,7 @@ void Deconv2dNeonK3x3S2(const float *input, out_row_0 += 8; out_row_1 += 8; out_row_2 += 8; + j += 4; } #endif for (; j < w; ++j) { diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc index bddb56f586bcfe2f44993f3e496531fd1c88f966..dd85896095d0922e02f3079809edd8972380f223 100644 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ b/mace/ops/arm/deconv_2d_neon_4x4.cc @@ -32,12 +32,12 @@ void Deconv2dNeonK4x4S1(const float *input, const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; #pragma omp parallel for collapse(2) - for (int b = 0; b < out_shape[0]; ++b) { - for (int oc = 0; oc < outch; oc += 2) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { float *out_base = output + (b * outch + oc) * out_img_size; float *out_base1 = out_base + out_img_size; - for (int q = 0; q < inch; q++) { + for (index_t q = 0; q < inch; q++) { const float *input_base = input + (b * inch + q) * h * w; const float *in = input_base; const float *kernel_base = filter + (oc * inch + q) * 16; @@ -62,7 +62,7 @@ void Deconv2dNeonK4x4S1(const float *input, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + i * outw; float *out_row_0 = out_row; @@ -77,7 +77,7 @@ void Deconv2dNeonK4x4S1(const float *input, float *out_row1_2 = out_row1_1 + outw; float *out_row1_3 = out_row1_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) for (; j + 3 < w; j += 4) { float32x4_t in_vec = vld1q_f32(in); @@ -252,7 +252,7 @@ void Deconv2dNeonK4x4S1(const float *input, } } else { float *out_base = output + (b * outch + oc) * out_img_size; - for (int q = 0; q < inch; q++) { + for (index_t q = 0; q < inch; q++) { const float *input_base = input + (b * inch + q) * h * w; const float *kernel_base = filter + (oc * inch + q) * 16; const float *in = input_base; @@ -266,7 +266,7 @@ void Deconv2dNeonK4x4S1(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + i * outw; float *out_row_0 = out_row; float *out_row_1 = out_row_0 + outw; @@ -387,10 +387,10 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t out_img_size = outh * outw; #pragma omp parallel for collapse(2) - for (int b = 0; b < out_shape[0]; ++b) { - for (int p = 0; p < outch; p++) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t p = 0; p < outch; p++) { float *out_base = output + (b * outch + p) * out_img_size; - for (int q = 0; q < inch; q++) { + for (index_t q = 0; q < inch; q++) { const float *input_base = input + (b * inch + q) * h * w; const float *kernel_base = filter + (p * inch + q) * 16; const float *in = input_base; @@ -405,7 +405,7 @@ void Deconv2dNeonK4x4S2(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + 2 * i * outw; float *out_row_0 = out_row; @@ -413,9 +413,9 @@ void Deconv2dNeonK4x4S2(const float *input, float *out_row_2 = out_row_1 + outw; float *out_row_3 = out_row_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -479,6 +479,7 @@ void Deconv2dNeonK4x4S2(const float *input, out_row_1 += 8; out_row_2 += 8; out_row_3 += 8; + j += 4; } #endif for (; j < w; j++) { diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc index 8a90b9fcf732b9cbcf942ff68b66b8e48feae92f..4296fb407ad24bd1e5cda017b36847616061627e 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc @@ -163,7 +163,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -209,6 +209,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, out_row_0 += 8; out_row_1 += 8; out_row_2 += 8; + j += 4; } #endif for (; j < w; ++j) { @@ -554,7 +555,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -600,6 +601,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, out_row_0 += 8; out_row_1 += 8; out_row_2 += 8; + j += 4; } #endif for (; j < w; ++j) { diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc index 6ae7dbb1338819458aea7a714ebecac1f87c97d7..744e70243652c11036f8e992877e6ee3627f35f7 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc @@ -34,8 +34,8 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, const index_t out_img_size = outh * outw; #pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; float *out_base = output + offset * out_img_size; const float *input_base = input + offset * in_img_size; @@ -51,13 +51,13 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + i * outw; float *out_row_0 = out_row; float *out_row_1 = out_row_0 + outw; float *out_row_2 = out_row_1 + outw; float *out_row_3 = out_row_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) for (; j + 3 < w; j += 4) { float32x4_t in_vec = vld1q_f32(in); @@ -170,8 +170,8 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, const index_t out_img_size = outh * outw; #pragma omp parallel for collapse(2) - for (int b = 0; b < out_shape[0]; ++b) { - for (int c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; float *out_base = output + offset * out_img_size; const float *input_base = input + offset * in_img_size; @@ -188,7 +188,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + 2 * i * outw; float *out_row_0 = out_row; @@ -196,9 +196,9 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, float *out_row_2 = out_row_1 + outw; float *out_row_3 = out_row_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -262,6 +262,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, out_row_1 += 8; out_row_2 += 8; out_row_3 += 8; + j += 4; } #endif for (; j < w; j++) { @@ -304,15 +305,15 @@ void GroupDeconv2dNeonK4x4S1(const float *input, const index_t outch_g = outch / group; #pragma omp parallel for collapse(3) - for (int b = 0; b < out_shape[0]; ++b) { + for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { - for (int oc = 0; oc < outch_g; oc += 2) { + for (index_t oc = 0; oc < outch_g; oc += 2) { if (oc + 1 < outch_g) { const index_t out_offset = (b * outch + outch_g * g + oc) * out_img_size; float *out_base = output + out_offset; float *out_base1 = out_base + out_img_size; - for (int ic = 0; ic < inch_g; ic++) { + for (index_t ic = 0; ic < inch_g; ic++) { const index_t in_offset = (b * inch + inch_g * g + ic) * in_img_size; const float *input_base = input + in_offset; @@ -341,7 +342,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + i * outw; float *out_row_0 = out_row; @@ -356,7 +357,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, float *out_row1_2 = out_row1_1 + outw; float *out_row1_3 = out_row1_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) for (; j + 3 < w; j += 4) { float32x4_t in_vec = vld1q_f32(in); @@ -533,7 +534,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, const index_t out_offset = (b * outch + outch_g * g + oc) * out_img_size; float *out_base = output + out_offset; - for (int ic = 0; ic < inch_g; ++ic) { + for (index_t ic = 0; ic < inch_g; ++ic) { const index_t in_offset = (b * inch + inch_g * g + ic) * in_img_size; const index_t kernel_offset = @@ -552,13 +553,13 @@ void GroupDeconv2dNeonK4x4S1(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + i * outw; float *out_row_0 = out_row; float *out_row_1 = out_row_0 + outw; float *out_row_2 = out_row_1 + outw; float *out_row_3 = out_row_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) for (; j + 3 < w; j += 4) { float32x4_t in_vec = vld1q_f32(in); @@ -679,13 +680,13 @@ void GroupDeconv2dNeonK4x4S2(const float *input, const index_t outch_g = outch / group; #pragma omp parallel for collapse(3) - for (int b = 0; b < out_shape[0]; ++b) { + for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { - for (int oc = 0; oc < outch_g; oc++) { + for (index_t oc = 0; oc < outch_g; oc++) { const index_t out_offset = (b * outch + outch_g * g + oc) * out_img_size; float *out_base = output + out_offset; - for (int ic = 0; ic < inch_g; ic++) { + for (index_t ic = 0; ic < inch_g; ic++) { const index_t in_offset = (b * inch + inch_g * g + ic) * in_img_size; const index_t kernel_offset = @@ -704,7 +705,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); #endif - for (int i = 0; i < h; i++) { + for (index_t i = 0; i < h; i++) { float *out_row = out_base + 2 * i * outw; float *out_row_0 = out_row; @@ -712,9 +713,9 @@ void GroupDeconv2dNeonK4x4S2(const float *input, float *out_row_2 = out_row_1 + outw; float *out_row_3 = out_row_2 + outw; - int j = 0; + index_t j = 0; #if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { + for (index_t n = 0; n + 9 < outw; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -778,6 +779,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, out_row_1 += 8; out_row_2 += 8; out_row_3 += 8; + j += 4; } #endif for (; j < w; j++) {