diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h index 3e52de1a105ff96c2fc93b6f0ce12f70078f4aa1..d57e0228db362f78eff95d369f7c2f27b67d5438 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.h @@ -40,6 +40,7 @@ static void TransposeNHWCToNCHWC3(const float *input, index_t in_offset = h * width * 3; index_t out_offset = h * width; +#if defined(MACE_ENABLE_NEON) index_t w; for (w = 0; w + 3 < width; w += 4) { float32x4x3_t vi = vld3q_f32(input + in_offset); @@ -56,6 +57,13 @@ static void TransposeNHWCToNCHWC3(const float *input, input[h * width * 3 + w * 3 + c]; } } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; + } + } +#endif } } @@ -69,13 +77,13 @@ static void TransposeNCHWToNHWCC2(const float *input, index_t in_offset = h * width; index_t out_offset = h * width * 2; +#if defined(MACE_ENABLE_NEON) index_t w; for (w = 0; w + 3 < width; w += 4) { float32x4_t vi0 = vld1q_f32(input + in_offset); float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); float32x4x2_t vi = {vi0, vi1}; vst2q_f32(output + out_offset, vi); - in_offset += 4; out_offset += 8; } @@ -85,6 +93,13 @@ static void TransposeNCHWToNHWCC2(const float *input, input[h * width + image_size * c + w]; } } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; + } + } +#endif } }