提交 7044cfa7 编写于 作者: T tensor-tang

add sgd jitcode and op test

test=develop
上级 8e041337
......@@ -32,3 +32,4 @@ USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/sgd.h"
#include <stddef.h> // offsetof
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void SgdJitCode::genCode() {
  // Emits AVX code that, for each selected row index r = rows[i], computes
  //   out[r, :] = param[r, :] - lr * grad[i, :]
  // The row width w_ must be a multiple of YMM_FLOAT_BLOCK (guaranteed by
  // SgdCreator::UseMe).
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  // Each group uses register pairs ymm_i (grad) / ymm_{i+num_regs} (param);
  // ymm15 is reserved for the broadcast learning rate, so at most 7 pairs.
  constexpr int max_num_regs = 7;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  const size_t width_size = w_ * sizeof(float);
  // Split one row into groups of up to max_num_regs YMM blocks; a trailing
  // smaller group covers the remainder blocks, if any.
  std::vector<int> groups(num_groups, max_num_regs);
  int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }
  // Broadcast the scalar learning rate into every lane of ymm_lr.
  vbroadcastss(ymm_lr, ptr[param_lr]);
  // protect rdx (the MUL below clobbers rdx:rax)
  mov(reg_ptr_grad_i, param_grad);
  mov(reg_ptr_rows_i, param_rows);
  // reg_rows_size_in_byte = rows + selected_rows_size * sizeof(int64_t),
  // i.e. the one-past-the-end pointer of the rows index array.
  mov(reg_rows_size_in_byte,
      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
  mov(rax, sizeof(int64_t));
  mul(reg_rows_size_in_byte);
  mov(reg_rows_size_in_byte, rax);
  add(reg_rows_size_in_byte, reg_ptr_rows_i);

  Label l_next_row;
  L(l_next_row);
  {
    // reg_row = rows[i] * width_size: byte offset of this row in param/out.
    mov(reg_row, qword[reg_ptr_rows_i]);
    mov(rax, width_size);
    mul(reg_row);
    mov(reg_row, rax);
    mov(reg_ptr_param_i, param_param);
    mov(reg_ptr_out_i, param_out);
    add(reg_ptr_param_i, reg_row);
    add(reg_ptr_out_i, reg_row);

    size_t w_offset = 0;
    for (int num_regs : groups) {
      // load grad blocks into ymm_0 .. ymm_{num_regs-1}
      size_t inner_offfset = w_offset;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offfset]);
        inner_offfset += block_size;
      }
      // load param blocks into ymm_{num_regs} .. ymm_{2*num_regs-1}
      inner_offfset = w_offset;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offfset]);
        inner_offfset += block_size;
      }
      // compute out = param - lr * grad
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
      }
      // save out
      inner_offfset = w_offset;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_out_i + inner_offfset], ymm_t(reg_i + num_regs));
        inner_offfset += block_size;
      }
      w_offset += (block_size * num_regs);
    }

    // Advance the (compact) grad row and the rows cursor; keep looping while
    // the rows cursor is below the precomputed end pointer.
    add(reg_ptr_grad_i, width_size);
    add(reg_ptr_rows_i, sizeof(int64_t));
    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
    jl(l_next_row, T_NEAR);
  }
  postCode();
}
// Factory that decides when the JIT-generated SGD kernel is applicable and
// builds the code object for a given attribute set.
class SgdCreator : public JitCodeCreator<sgd_attr_t> {
 public:
  // The generated code needs AVX, and a row width that is an exact multiple
  // of one YMM float block (it has no scalar tail loop).
  bool UseMe(const sgd_attr_t& attr) const override {
    const bool width_aligned = attr.grad_width % YMM_FLOAT_BLOCK == 0;
    return width_aligned && platform::MayIUse(platform::avx);
  }
  // Upper-bound estimate of the emitted code size in bytes: a fixed budget
  // for prologue/epilogue plus a per-YMM-block instruction budget.
  size_t CodeSize(const sgd_attr_t& attr) const override {
    const size_t num_blocks = attr.grad_width / YMM_FLOAT_BLOCK;
    return 96 + num_blocks * 32 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const sgd_attr_t& attr) const override {
    // Param and grad rows must have identical widths, and the number of
    // selected rows must lie in [0, grad_height].
    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
    return make_unique<SgdJitCode>(attr, CodeSize(attr));
  }
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// JIT code generator for the sparse SGD kernel:
//   out[rows[i], :] = param[rows[i], :] - lr * grad[i, :]
class SgdJitCode : public JitCode {
 public:
  // code_size is the buffer-size hint for the emitted code; SgdCreator
  // computes a tighter bound from the row width. Only attr.grad_width is
  // retained (as w_).
  explicit SgdJitCode(const sgd_attr_t& attr, size_t code_size = 256 * 1024,
                      void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(attr.grad_width) {
    this->genCode();
  }

  DECLARE_JIT_CODE(SgdJitCode);

  void genCode() override;

 private:
  int w_;  // row width of param/grad/out, in floats

  // Incoming ABI argument registers, in kernel-signature order:
  // (lr, param, grad, rows, out, attr).
  reg64_t param_lr{abi_param1};
  reg64_t param_param{abi_param2};
  reg64_t param_grad{abi_param3};
  reg64_t param_rows{abi_param4};
  reg64_t param_out{abi_param5};
  reg64_t param_attr{abi_param6};

  // Learning rate broadcast to all lanes; ymm15 is kept out of the work
  // registers genCode allocates (ymm0..ymm13).
  ymm_t ymm_lr = ymm_t(15);

  // Scratch registers used by genCode. NOTE(review): r12-r15 are callee-saved
  // in both SysV and Win64 ABIs -- presumably preCode()/postCode() save and
  // restore them; verify in the JitCode base class.
  reg64_t reg_ptr_grad_i{r10};         // cursor into the compact grad rows
  reg64_t reg_ptr_rows_i{r11};         // cursor into the rows index array
  reg64_t reg_rows_size_in_byte{r12};  // end pointer of the rows array
  reg64_t reg_row{r13};                // current row's byte offset
  reg64_t reg_ptr_param_i{r14};        // param base + row offset
  reg64_t reg_ptr_out_i{r15};          // out base + row offset
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -24,17 +24,28 @@ from op_test import OpTest
class TestSGDOp(OpTest):
    """Dense SGD op test: checks ParamOut = Param - LearningRate * Grad.

    Bug fixed: the scraped diff left the stale pre-change lines that built
    `w` and `g` with a hard-coded (102, 105) shape right before the
    conf()-driven versions; the dead duplicates are removed so the shape is
    controlled solely by conf().
    """

    def setUp(self):
        self.op_type = "sgd"
        self.conf()
        w = np.random.random((self.h, self.w)).astype("float32")
        g = np.random.random((self.h, self.w)).astype("float32")
        lr = np.array([0.1]).astype("float32")

        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
        self.outputs = {'ParamOut': w - lr * g}

    def conf(self):
        # Default problem size; subclasses override to exercise other widths.
        self.h = 102
        self.w = 105

    def test_check_output(self):
        self.check_output()
class TestSGDOpCase8X(TestSGDOp):
    """Dense SGD test variant whose width is a multiple of 8."""

    def conf(self):
        # Override the default problem size from TestSGDOp.
        self.h, self.w = 10, 64
class TestSparseSGDOp(unittest.TestCase):
def check_with_place(self, place):
scope = core.Scope()
......@@ -42,12 +53,12 @@ class TestSparseSGDOp(unittest.TestCase):
# create and initialize Grad Variable
height = 10
rows = [0, 4, 7]
row_numel = 12
self.conf()
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
np_array = np.ones((len(rows), row_numel)).astype("float32")
np_array = np.ones((len(rows), self.row_numel)).astype("float32")
np_array[0, 0] = 2.0
np_array[2, 8] = 4.0
......@@ -56,7 +67,7 @@ class TestSparseSGDOp(unittest.TestCase):
# create and initialize Param Variable
param = scope.var('Param').get_tensor()
param_array = np.full((height, row_numel), 5.0).astype("float32")
param_array = np.full((height, self.row_numel), 5.0).astype("float32")
param.set(param_array, place)
# create and initialize LeraningRate Variable
......@@ -98,6 +109,14 @@ class TestSparseSGDOp(unittest.TestCase):
for place in places:
self.check_with_place(place)
def conf(self):
self.row_numel = 12
class TestSparseSGDOpCase8X(TestSparseSGDOp):
    """Sparse SGD test variant with row_numel = 16, a multiple of 8."""

    def conf(self):
        self.row_numel = 16
class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
def check_with_place(self, place):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册