提交 544f5dd0 编写于 作者: 李寅

Fix transpose on host runtime: add scalar fallback loops when MACE_ENABLE_NEON is not defined

上级 58a20531
...@@ -40,6 +40,7 @@ static void TransposeNHWCToNCHWC3(const float *input, ...@@ -40,6 +40,7 @@ static void TransposeNHWCToNCHWC3(const float *input,
index_t in_offset = h * width * 3; index_t in_offset = h * width * 3;
index_t out_offset = h * width; index_t out_offset = h * width;
#if defined(MACE_ENABLE_NEON)
index_t w; index_t w;
for (w = 0; w + 3 < width; w += 4) { for (w = 0; w + 3 < width; w += 4) {
float32x4x3_t vi = vld3q_f32(input + in_offset); float32x4x3_t vi = vld3q_f32(input + in_offset);
...@@ -56,6 +57,13 @@ static void TransposeNHWCToNCHWC3(const float *input, ...@@ -56,6 +57,13 @@ static void TransposeNHWCToNCHWC3(const float *input,
input[h * width * 3 + w * 3 + c]; input[h * width * 3 + w * 3 + c];
} }
} }
#else
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 3; ++c) {
output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c];
}
}
#endif
} }
} }
...@@ -69,13 +77,13 @@ static void TransposeNCHWToNHWCC2(const float *input, ...@@ -69,13 +77,13 @@ static void TransposeNCHWToNHWCC2(const float *input,
index_t in_offset = h * width; index_t in_offset = h * width;
index_t out_offset = h * width * 2; index_t out_offset = h * width * 2;
#if defined(MACE_ENABLE_NEON)
index_t w; index_t w;
for (w = 0; w + 3 < width; w += 4) { for (w = 0; w + 3 < width; w += 4) {
float32x4_t vi0 = vld1q_f32(input + in_offset); float32x4_t vi0 = vld1q_f32(input + in_offset);
float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); float32x4_t vi1 = vld1q_f32(input + in_offset + image_size);
float32x4x2_t vi = {vi0, vi1}; float32x4x2_t vi = {vi0, vi1};
vst2q_f32(output + out_offset, vi); vst2q_f32(output + out_offset, vi);
in_offset += 4; in_offset += 4;
out_offset += 8; out_offset += 8;
} }
...@@ -85,6 +93,13 @@ static void TransposeNCHWToNHWCC2(const float *input, ...@@ -85,6 +93,13 @@ static void TransposeNCHWToNHWCC2(const float *input,
input[h * width + image_size * c + w]; input[h * width + image_size * c + w];
} }
} }
#else
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 2; ++c) {
output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w];
}
}
#endif
} }
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册