Commit 23275903 authored by hjchen2

Fix bugs

Parent 3a08cac8
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 // Inspired by https://arxiv.org/abs/1509.09308 and
-// https://github.com/andravin/wincnn Refered from nnpack and ncnn project
+// https://github.com/andravin/wincnn and refered from nnpack and ncnn project
 #include "operators/math/pad.h"
 #include "operators/math/winograd/winograd_transform.h"
@@ -326,7 +326,6 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
   int width = input.dims()[3];
   int h_tiles = (height + 3) / 6;  // (height + 5 - 2) / 6
   int w_tiles = (width + 3) / 6;   // (width + 5 - 2) / 6
   int tiles = (h_tiles * w_tiles + 7) / 8;
   framework::DDim transformed_shape =
       framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
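For context on the tile arithmetic: in Winograd F(6x6, 3x3), 8x8 input patches advance by 6 pixels and overlap by 2, so a dimension of size n yields ceil((n - 2) / 6) tiles, which the kernel writes as (n + 3) / 6; tiles are then rounded up to groups of 8. A minimal standalone check of that equivalence (values here are arbitrary examples, not from the patch):

    #include <cstdio>

    int main() {
      // ceil((n - 2) / 6) versus the (n + 3) / 6 form used in the kernel.
      const int sizes[] = {8, 9, 14, 20, 224};
      for (int n : sizes) {
        int by_ceil = (n - 2 + 5) / 6;  // ceil((n - 2) / 6)
        int by_kernel = (n + 3) / 6;    // form used above
        std::printf("n=%3d  ceil=%2d  kernel=%2d\n", n, by_ceil, by_kernel);
      }
      return 0;
    }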
@@ -549,10 +548,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
         "vsub.f32   q10, q9, q3                   \n"
         "vsub.f32   q11, q5, q7                   \n"
         "vmla.f32   q10, q11, d0[0]               \n"
-        "vst1.32    {d24[0]}, [%[out0]], r1       \n"
-        "vst1.32    {d24[1]}, [%[out0]], r1       \n"
-        "vst1.32    {d25[0]}, [%[out0]], r1       \n"
-        "vst1.32    {d25[1]}, [%[out0]], %[steps] \n"
+        "vst1.32    {d20[0]}, [%[out0]], r1       \n"
+        "vst1.32    {d20[1]}, [%[out0]], r1       \n"
+        "vst1.32    {d21[0]}, [%[out0]], r1       \n"
+        "vst1.32    {d21[1]}, [%[out0]], %[steps] \n"
         // col 1:
         "vld1.32    {d4-d5}, [%[ptr0]]!           \n"  // q2: d0
@@ -639,10 +638,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
         "vsub.f32   q10, q9, q3                   \n"
         "vsub.f32   q11, q5, q7                   \n"
         "vmla.f32   q10, q11, d0[0]               \n"
-        "vst1.32    {d24[0]}, [%[out0]], r1       \n"
-        "vst1.32    {d24[1]}, [%[out0]], r1       \n"
-        "vst1.32    {d25[0]}, [%[out0]], r1       \n"
-        "vst1.32    {d25[1]}, [%[out0]], %[steps] \n"
+        "vst1.32    {d20[0]}, [%[out0]], r1       \n"
+        "vst1.32    {d20[1]}, [%[out0]], r1       \n"
+        "vst1.32    {d21[0]}, [%[out0]], r1       \n"
+        "vst1.32    {d21[1]}, [%[out0]], %[steps] \n"
         "subs       r0, #1                        \n"
         "bne        loop_col_%=                   \n"
@@ -869,9 +868,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
     }
   }
-  for (int i = 0; i < output->numel(); ++i) {
-    DLOG << "TransInput[" << i << "] = " << outptr[i];
-  }
+  // for (int c = 0; c < channel; ++c) {
+  //   for (int tile = 0; tile < output->numel() / channel / 64; ++tile) {
+  //     for (int i = 0; i < 64; ++i) {
+  //       int offset = (((tile / 8) * 64 + i) * channel + c) * 8 + (tile % 8);
+  //       DLOG << "TransInput[" << i << "] = " << outptr[offset];
+  //     }
+  //   }
+  // }
 }

 template <>
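The commented-out dump above also documents the packed layout of the transformed input: shape {tiles, 64, channel, 8}, with tiles processed in groups of 8. A small helper expressing that index (the function name is mine, not from the file):

    // Offset into the transformed input, laid out as
    // [tiles / 8][64][channel][8]: tile-group index, then the 64 Winograd
    // coefficients, then channels, then the tile's slot within its group.
    // Mirrors the offset in the commented-out debug loop above.
    static inline int trans_input_offset(int tile, int coeff, int c,
                                         int channel) {
      return (((tile / 8) * 64 + coeff) * channel + c) * 8 + (tile % 8);
    }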
@@ -896,9 +900,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
   for (int i = 0; i < out_channel; ++i) {
     float *uv_ptr = uv_trans_ptr + (i * tiles * 64 * 32);
     for (int k = 0; k < 64; ++k) {
-      const float *w_ptr = weight_ptr + (i * 64 + k) * in_channel * 4;
       for (int j = 0; j < tiles; ++j) {
-        const float *in_ptr = input_ptr + (k * tiles + j) * in_channel * 8;
+        const float *w_ptr = weight_ptr + (i * 64 + k) * in_channel * 4;
+        const float *in_ptr = input_ptr + (j * 64 + k) * in_channel * 8;
         float *out0 = uv_ptr + (8 * j) * 64 + k;  // out channel 0
         float *out1 = out0 + 8 * tiles * 64;      // out channel 1
         float *out2 = out1 + 8 * tiles * 64;      // out channel 2
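This hunk is the second bug fix: `in_ptr` previously indexed the transformed input as if it were coefficient-major, (k * tiles + j), but the input transform writes it tile-major, so the correct offset is (j * 64 + k) * in_channel * 8, matching the {tiles, 64, channel, 8} shape above. Moving `w_ptr` inside the j-loop is behaviour-preserving, since it depends only on i and k; it just keeps the two pointer computations together. Illustrative helpers for the corrected arithmetic (names are mine):

    // Transformed weights are read in packs of 4 floats per input channel,
    // the transformed input in packs of 8 (one per tile in the current
    // tile group of 8), tile-major as produced by winograd_transform_input.
    static inline const float *weight_at(const float *weight_ptr, int i,
                                         int k, int in_channel) {
      return weight_ptr + (i * 64 + k) * in_channel * 4;
    }
    static inline const float *input_at(const float *input_ptr, int j,
                                        int k, int in_channel) {
      return input_ptr + (j * 64 + k) * in_channel * 8;  // was (k * tiles + j)
    }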
@@ -1013,9 +1017,15 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
     }
   }
-  for (int i = 0; i < uv_trans.numel(); ++i) {
-    DLOG << "uv_trans[" << i << "] = " << uv_trans_ptr[i];
-  }
+  // for (int c = 0; c < 4 * out_channel; ++c) {
+  //   for (int tile = 0; tile < 8 * tiles; ++tile) {
+  //     for (int i = 0; i < 64; ++i) {
+  //       int offset = (c * 8 * tiles + tile) * 64 + i;
+  //       DLOG << "uv_trans[" << i << "] = " << uv_trans_ptr[offset];
+  //     }
+  //   }
+  // }
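As with the input dump, the loop bounds here document the uv_trans layout: 4 * out_channel logical channels, each holding 8 * tiles tile slots of 64 coefficients, which is consistent with out1 = out0 + 8 * tiles * 64 in the gemm loop above. A sketch of that index (function name is illustrative):

    // Offset into uv_trans: channel-major over 4 * out_channel channels,
    // then 8 * tiles tile slots, then the 64 Winograd coefficients.
    static inline int uv_trans_offset(int c, int tile, int coeff, int tiles) {
      return (c * 8 * tiles + tile) * 64 + coeff;
    }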
  /*
   * s0 = m0 + (m1 + m2) + (m3 + m4) + 32 * (m5 + m6)
   * s1 = (m1 - m2) + 2 * (m3 - m4) + 16 * (m5 - m6)
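The comment spells out the rows of the inverse (output) transform applied to the eight gemm results m0..m7; only the first two rows are visible in this diff, the rest are collapsed. A plain scalar form of just those two rows (the helper name is mine; the remaining rows are omitted rather than guessed):

    // s0 and s1 exactly as quoted in the comment above.
    static inline void output_rows_0_1(const float m[8], float *s0,
                                       float *s1) {
      *s0 = m[0] + (m[1] + m[2]) + (m[3] + m[4]) + 32.f * (m[5] + m[6]);
      *s1 = (m[1] - m[2]) + 2.f * (m[3] - m[4]) + 16.f * (m[5] - m[6]);
    }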
@@ -1035,11 +1045,11 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
   int uv_image_size = uv_trans.dims()[1] * 64;
   float transform_matrix[8] = {2.f, 4.f, 8.f, 16.f};
-  DLOG << "out_channel: " << out_channel;
-  DLOG << "h_tiles: " << h_tiles;
-  DLOG << "w_tiles: " << w_tiles;
-  DLOG << "remain_h: " << remain_h;
-  DLOG << "remain_w: " << remain_w;
+  // DLOG << "out_channel: " << out_channel;
+  // DLOG << "h_tiles: " << h_tiles;
+  // DLOG << "w_tiles: " << w_tiles;
+  // DLOG << "remain_h: " << remain_h;
+  // DLOG << "remain_w: " << remain_w;
   for (int oc = 0; oc < out_channel; ++oc) {
     float at_m[48];  // [6][8]
@@ -1108,9 +1118,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
           : [tm_ptr] "r"((float *)transform_matrix)
           : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
             "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0");
-      for (int i = 0; i < 48; ++i) {
-        DLOG << "at_m[" << i << "] = " << at_m[i];
-      }
+      // for (int i = 0; i < 48; ++i) {
+      //   DLOG << "at_m[" << i << "] = " << at_m[i];
+      // }
       float *at_m_ptr0 = at_m;
       float *at_m_ptr1 = at_m + 24;
@@ -1242,9 +1252,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
           float *out_ptr = output_ptr + offset;
           int remain_row = (tile_h < h_tiles - 1) ? 6 : remain_h;
           int remain_col = (tile_w < w_tiles - 1) ? 6 : remain_w;
-          for (int i = 0; i < 36; ++i) {
-            DLOG << "output_tmp[" << i << "] = " << output_tmp[i];
-          }
+          // for (int i = 0; i < 36; ++i) {
+          //   DLOG << "output_tmp[" << i << "] = " << output_tmp[i];
+          // }
           for (int i = 0; i < remain_row; ++i, out_ptr += out_w) {
             memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float));
           }
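The tail of this hunk handles boundary tiles: interior tiles write a full 6x6 block, while tiles on the bottom/right edge write only remain_row x remain_col valid pixels, copied row by row out of the 6x6 scratch buffer output_tmp. A standalone sketch of that copy, assuming the 6x6 block size from the patch (function name is mine):

    #include <cstring>

    // Copy the valid region of a 6x6 Winograd output tile into the output
    // image; out_w is the row stride of the output in floats.
    static void copy_output_tile(const float output_tmp[36], float *out_ptr,
                                 int out_w, int remain_row, int remain_col) {
      for (int i = 0; i < remain_row; ++i, out_ptr += out_w) {
        std::memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float));
      }
    }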