提交 cab46b62 作者: tensor-tang 提交者: ceci3

refine vbroadcast jitcode

test=develop
上级 6010361c
...@@ -37,36 +37,33 @@ void VBroadcastJitCode::genCode() { ...@@ -37,36 +37,33 @@ void VBroadcastJitCode::genCode() {
} }
// protect param_h // protect param_h
const size_t width_in_byte = sizeof(float) * w_;
mov(reg_height, param_h); mov(reg_height, param_h);
int acc_num_regs = 0; Label l_next_h;
for (int num_regs : groups) { xor_(reg_h_i, reg_h_i);
mov(reg_ptr_dst_i, param_dst);
L(l_next_h);
{
mov(reg_ptr_src_i, param_src); mov(reg_ptr_src_i, param_src);
add(reg_ptr_src_i, acc_num_regs * block_size); for (int num_regs : groups) {
size_t w_offset = 0; size_t w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) { for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
w_offset += block_size; w_offset += block_size;
} }
add(reg_ptr_src_i, num_regs * block_size);
Label l_next_h;
xor_(reg_h_i, reg_h_i);
mov(reg_ptr_dst_i, param_dst);
add(reg_ptr_dst_i, acc_num_regs * block_size);
L(l_next_h);
{
w_offset = 0; w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) { for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
w_offset += block_size; w_offset += block_size;
} }
add(reg_ptr_dst_i, width_in_byte); add(reg_ptr_dst_i, num_regs * block_size);
} // end of groups
inc(reg_h_i); inc(reg_h_i);
cmp(reg_h_i, reg_height); cmp(reg_h_i, reg_height);
jl(l_next_h, T_NEAR); jl(l_next_h, T_NEAR);
} // end of l_next_h } // end of l_next_h
acc_num_regs += num_regs;
} // end of groups
postCode(); postCode();
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册