未验证 提交 4f5e4540 编写于 作者: A Adam 提交者: GitHub

Improve SGD jit code to work with large data (#23120)

上级 6fc6bb3e
...@@ -24,22 +24,41 @@ namespace operators { ...@@ -24,22 +24,41 @@ namespace operators {
namespace jit { namespace jit {
namespace gen { namespace gen {
void SgdJitCode::mainCode(int num_regs) {
constexpr size_t block_size = sizeof(float) * YMM_FLOAT_BLOCK;
// load grad
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i]);
add(reg_ptr_grad_i, block_size);
}
// load param
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i]);
add(reg_ptr_param_i, block_size);
}
// compute out
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
}
// save out
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_out_i], ymm_t(reg_i + num_regs));
add(reg_ptr_out_i, block_size);
}
}
void SgdJitCode::genCode() { void SgdJitCode::genCode() {
preCode(); preCode();
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 7; constexpr int max_num_regs = 7;
const int num_block = w_ / block; const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs; const int num_groups = num_block / max_num_regs;
const size_t block_size = sizeof(float) * block;
const size_t width_size = w_ * sizeof(float);
std::vector<int> groups(num_groups, max_num_regs);
int rest_num_regs = num_block % max_num_regs; int rest_num_regs = num_block % max_num_regs;
if (rest_num_regs > 0) { const size_t width_size = w_ * sizeof(float);
groups.push_back(rest_num_regs);
}
vbroadcastss(ymm_lr, ptr[param_lr]); vbroadcastss(ymm_lr, ptr[param_lr]);
// protect rdx
mov(reg_ptr_grad_i, param_grad); mov(reg_ptr_grad_i, param_grad);
mov(reg_ptr_rows_i, param_rows); mov(reg_ptr_rows_i, param_rows);
...@@ -63,43 +82,27 @@ void SgdJitCode::genCode() { ...@@ -63,43 +82,27 @@ void SgdJitCode::genCode() {
add(reg_ptr_param_i, reg_row); add(reg_ptr_param_i, reg_row);
add(reg_ptr_out_i, reg_row); add(reg_ptr_out_i, reg_row);
size_t w_offset = 0; Label inner_loop;
for (int num_regs : groups) { Label escape_loop;
// load grad mov(rax, 0);
size_t inner_offfset = w_offset; L(inner_loop);
for (int reg_i = 0; reg_i < num_regs; ++reg_i) { {
vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offfset]); cmp(rax, num_groups);
inner_offfset += block_size; jnb(escape_loop, T_NEAR);
}
// load param mainCode(max_num_regs);
inner_offfset = w_offset;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offfset]);
inner_offfset += block_size;
}
// compute out inc(rax);
for (int reg_i = 0; reg_i < num_regs; ++reg_i) { jmp(inner_loop, T_NEAR);
vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
} }
L(escape_loop);
mainCode(rest_num_regs);
// save out
inner_offfset = w_offset;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_out_i + inner_offfset], ymm_t(reg_i + num_regs));
inner_offfset += block_size;
}
w_offset += (block_size * num_regs);
}
add(reg_ptr_grad_i, width_size);
add(reg_ptr_rows_i, sizeof(int64_t)); add(reg_ptr_rows_i, sizeof(int64_t));
cmp(reg_ptr_rows_i, reg_rows_size_in_byte); cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
jl(l_next_row, T_NEAR); jl(l_next_row, T_NEAR);
} }
postCode(); postCode();
} }
...@@ -109,9 +112,7 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> { ...@@ -109,9 +112,7 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
return platform::MayIUse(platform::avx) && return platform::MayIUse(platform::avx) &&
attr.grad_width % YMM_FLOAT_BLOCK == 0; attr.grad_width % YMM_FLOAT_BLOCK == 0;
} }
size_t CodeSize(const sgd_attr_t& attr) const override { size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
}
std::unique_ptr<GenBase> CreateJitCode( std::unique_ptr<GenBase> CreateJitCode(
const sgd_attr_t& attr) const override { const sgd_attr_t& attr) const override {
PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
......
...@@ -34,6 +34,7 @@ class SgdJitCode : public JitCode { ...@@ -34,6 +34,7 @@ class SgdJitCode : public JitCode {
DECLARE_JIT_CODE(SgdJitCode); DECLARE_JIT_CODE(SgdJitCode);
void genCode() override; void genCode() override;
void mainCode(int num_regs);
private: private:
int w_; int w_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册