“d6adf881faae799c2e9f344592856ac7361e5e7c”上不存在“micro/include/utils/macros.h”
提交 ea2da73a 作者: liutuo

fix deconv neon bug

上级 5059f1c0
...@@ -319,7 +319,7 @@ void Deconv2dNeonK3x3S2(const float *input, ...@@ -319,7 +319,7 @@ void Deconv2dNeonK3x3S2(const float *input,
index_t j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -365,6 +365,7 @@ void Deconv2dNeonK3x3S2(const float *input, ...@@ -365,6 +365,7 @@ void Deconv2dNeonK3x3S2(const float *input,
out_row_0 += 8; out_row_0 += 8;
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
j += 4;
} }
#endif #endif
for (; j < w; ++j) { for (; j < w; ++j) {
......
...@@ -32,12 +32,12 @@ void Deconv2dNeonK4x4S1(const float *input, ...@@ -32,12 +32,12 @@ void Deconv2dNeonK4x4S1(const float *input,
const index_t outch = out_shape[1]; const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw; const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < out_shape[0]; ++b) { for (index_t b = 0; b < out_shape[0]; ++b) {
for (int oc = 0; oc < outch; oc += 2) { for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) { if (oc + 1 < outch) {
float *out_base = output + (b * outch + oc) * out_img_size; float *out_base = output + (b * outch + oc) * out_img_size;
float *out_base1 = out_base + out_img_size; float *out_base1 = out_base + out_img_size;
for (int q = 0; q < inch; q++) { for (index_t q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w; const float *input_base = input + (b * inch + q) * h * w;
const float *in = input_base; const float *in = input_base;
const float *kernel_base = filter + (oc * inch + q) * 16; const float *kernel_base = filter + (oc * inch + q) * 16;
...@@ -62,7 +62,7 @@ void Deconv2dNeonK4x4S1(const float *input, ...@@ -62,7 +62,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
...@@ -77,7 +77,7 @@ void Deconv2dNeonK4x4S1(const float *input, ...@@ -77,7 +77,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + outw;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
...@@ -252,7 +252,7 @@ void Deconv2dNeonK4x4S1(const float *input, ...@@ -252,7 +252,7 @@ void Deconv2dNeonK4x4S1(const float *input,
} }
} else { } else {
float *out_base = output + (b * outch + oc) * out_img_size; float *out_base = output + (b * outch + oc) * out_img_size;
for (int q = 0; q < inch; q++) { for (index_t q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w; const float *input_base = input + (b * inch + q) * h * w;
const float *kernel_base = filter + (oc * inch + q) * 16; const float *kernel_base = filter + (oc * inch + q) * 16;
const float *in = input_base; const float *in = input_base;
...@@ -266,7 +266,7 @@ void Deconv2dNeonK4x4S1(const float *input, ...@@ -266,7 +266,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + outw;
...@@ -387,10 +387,10 @@ void Deconv2dNeonK4x4S2(const float *input, ...@@ -387,10 +387,10 @@ void Deconv2dNeonK4x4S2(const float *input,
const index_t out_img_size = outh * outw; const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < out_shape[0]; ++b) { for (index_t b = 0; b < out_shape[0]; ++b) {
for (int p = 0; p < outch; p++) { for (index_t p = 0; p < outch; p++) {
float *out_base = output + (b * outch + p) * out_img_size; float *out_base = output + (b * outch + p) * out_img_size;
for (int q = 0; q < inch; q++) { for (index_t q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w; const float *input_base = input + (b * inch + q) * h * w;
const float *kernel_base = filter + (p * inch + q) * 16; const float *kernel_base = filter + (p * inch + q) * 16;
const float *in = input_base; const float *in = input_base;
...@@ -405,7 +405,7 @@ void Deconv2dNeonK4x4S2(const float *input, ...@@ -405,7 +405,7 @@ void Deconv2dNeonK4x4S2(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
...@@ -413,9 +413,9 @@ void Deconv2dNeonK4x4S2(const float *input, ...@@ -413,9 +413,9 @@ void Deconv2dNeonK4x4S2(const float *input,
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -479,6 +479,7 @@ void Deconv2dNeonK4x4S2(const float *input, ...@@ -479,6 +479,7 @@ void Deconv2dNeonK4x4S2(const float *input,
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
out_row_3 += 8; out_row_3 += 8;
j += 4;
} }
#endif #endif
for (; j < w; j++) { for (; j < w; j++) {
......
...@@ -163,7 +163,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, ...@@ -163,7 +163,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
index_t j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -209,6 +209,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, ...@@ -209,6 +209,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
out_row_0 += 8; out_row_0 += 8;
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
j += 4;
} }
#endif #endif
for (; j < w; ++j) { for (; j < w; ++j) {
...@@ -554,7 +555,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, ...@@ -554,7 +555,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
index_t j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -600,6 +601,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, ...@@ -600,6 +601,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
out_row_0 += 8; out_row_0 += 8;
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
j += 4;
} }
#endif #endif
for (; j < w; ++j) { for (; j < w; ++j) {
......
...@@ -34,8 +34,8 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, ...@@ -34,8 +34,8 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const index_t out_img_size = outh * outw; const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c; const index_t offset = b * channels + c;
float *out_base = output + offset * out_img_size; float *out_base = output + offset * out_img_size;
const float *input_base = input + offset * in_img_size; const float *input_base = input + offset * in_img_size;
...@@ -51,13 +51,13 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, ...@@ -51,13 +51,13 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
...@@ -170,8 +170,8 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, ...@@ -170,8 +170,8 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const index_t out_img_size = outh * outw; const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
for (int b = 0; b < out_shape[0]; ++b) { for (index_t b = 0; b < out_shape[0]; ++b) {
for (int c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c; const index_t offset = b * channels + c;
float *out_base = output + offset * out_img_size; float *out_base = output + offset * out_img_size;
const float *input_base = input + offset * in_img_size; const float *input_base = input + offset * in_img_size;
...@@ -188,7 +188,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, ...@@ -188,7 +188,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
...@@ -196,9 +196,9 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, ...@@ -196,9 +196,9 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -262,6 +262,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input, ...@@ -262,6 +262,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
out_row_3 += 8; out_row_3 += 8;
j += 4;
} }
#endif #endif
for (; j < w; j++) { for (; j < w; j++) {
...@@ -304,15 +305,15 @@ void GroupDeconv2dNeonK4x4S1(const float *input, ...@@ -304,15 +305,15 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const index_t outch_g = outch / group; const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3) #pragma omp parallel for collapse(3)
for (int b = 0; b < out_shape[0]; ++b) { for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) { for (int g = 0; g < group; ++g) {
for (int oc = 0; oc < outch_g; oc += 2) { for (index_t oc = 0; oc < outch_g; oc += 2) {
if (oc + 1 < outch_g) { if (oc + 1 < outch_g) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * outch + outch_g * g + oc) * out_img_size;
float *out_base = output + out_offset; float *out_base = output + out_offset;
float *out_base1 = out_base + out_img_size; float *out_base1 = out_base + out_img_size;
for (int ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * inch + inch_g * g + ic) * in_img_size;
const float *input_base = input + in_offset; const float *input_base = input + in_offset;
...@@ -341,7 +342,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, ...@@ -341,7 +342,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
...@@ -356,7 +357,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, ...@@ -356,7 +357,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + outw;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
...@@ -533,7 +534,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, ...@@ -533,7 +534,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * outch + outch_g * g + oc) * out_img_size;
float *out_base = output + out_offset; float *out_base = output + out_offset;
for (int ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < inch_g; ++ic) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * inch + inch_g * g + ic) * in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
...@@ -552,13 +553,13 @@ void GroupDeconv2dNeonK4x4S1(const float *input, ...@@ -552,13 +553,13 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
...@@ -679,13 +680,13 @@ void GroupDeconv2dNeonK4x4S2(const float *input, ...@@ -679,13 +680,13 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const index_t outch_g = outch / group; const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3) #pragma omp parallel for collapse(3)
for (int b = 0; b < out_shape[0]; ++b) { for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) { for (int g = 0; g < group; ++g) {
for (int oc = 0; oc < outch_g; oc++) { for (index_t oc = 0; oc < outch_g; oc++) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * outch + outch_g * g + oc) * out_img_size;
float *out_base = output + out_offset; float *out_base = output + out_offset;
for (int ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * inch + inch_g * g + ic) * in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
...@@ -704,7 +705,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, ...@@ -704,7 +705,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
#endif #endif
for (int i = 0; i < h; i++) { for (index_t i = 0; i < h; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * outw;
float *out_row_0 = out_row; float *out_row_0 = out_row;
...@@ -712,9 +713,9 @@ void GroupDeconv2dNeonK4x4S2(const float *input, ...@@ -712,9 +713,9 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + outw;
int j = 0; index_t j = 0;
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) { for (index_t n = 0; n + 9 < outw; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -778,6 +779,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, ...@@ -778,6 +779,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
out_row_1 += 8; out_row_1 += 8;
out_row_2 += 8; out_row_2 += 8;
out_row_3 += 8; out_row_3 += 8;
j += 4;
} }
#endif #endif
for (; j < w; j++) { for (; j < w; j++) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册