提交 a0c37662 编写于 作者: T tensor-tang

enable sgd jitkernel refer code and test

test=develop
上级 1dad36f6
......@@ -31,7 +31,8 @@ namespace gen {
// Application Binary Interface
constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
abi_param4(Xbyak::Operand::RCX);
abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8),
abi_param6(Xbyak::Operand::R9);
constexpr Xbyak::Operand::Code g_abi_regs[] = {
Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
......
......@@ -55,6 +55,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kHSum);
ONE_CASE(kSoftmax);
ONE_CASE(kEmbSeqPool);
ONE_CASE(kSgd);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel";
......
......@@ -181,6 +181,14 @@ inline std::ostream& operator<<(std::ostream& os,
return os;
}
inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) {
os << "param_height[" << attr.param_height << "],param_width["
<< attr.param_width << "],grad_height[" << attr.grad_height
<< "],grad_width[" << attr.grad_width << "],selected_rows_size["
<< attr.selected_rows_size << "]";
return os;
}
inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
return os;
......
......@@ -46,6 +46,7 @@ typedef enum {
kVMul,
kVRelu,
kVScal,
kSgd,
kVSigmoid,
kVSquare,
kVSub,
......@@ -173,6 +174,28 @@ struct EmbSeqPoolTuples {
const emb_seq_pool_attr_t*);
};
typedef struct sgd_attr_s {
int64_t param_height, param_width;
int64_t grad_height, grad_width;
int64_t selected_rows_size;
sgd_attr_s() = default;
explicit sgd_attr_s(int64_t param_h, int64_t param_w, int64_t grad_h,
int64_t grad_w, int64_t selected_rows_sz)
: param_height(param_h),
param_width(param_w),
grad_height(grad_h),
grad_width(grad_w),
selected_rows_size(selected_rows_sz) {}
} sgd_attr_t;
template <typename T>
struct SgdTuples {
typedef T data_type;
typedef sgd_attr_t attr_type;
typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
const sgd_attr_t*);
};
typedef struct matmul_attr_s {
int m, n, k;
void* packed_weight{nullptr};
......
......@@ -61,6 +61,11 @@ size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
return attr.table_width;
}
template <>
size_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
return attr.grad_width;
}
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -33,3 +33,4 @@ USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
......@@ -59,4 +59,6 @@ REGISTER_REFER_KERNEL(kSoftmax, Softmax);
REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
REGISTER_REFER_KERNEL(kSgd, Sgd);
#undef REGISTER_REFER_KERNEL
......@@ -446,6 +446,36 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
}
}
// SGD algorithm:
// lr is pointor of learning rate scalar
// param is an input matrix with (param_h, param_w)
// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w
// selected_rows is a vectot<int64_t> with size selected_rows_size( <= grad_h )
// out is an output matrix with (param_h, param_w)
//
// support both regular and sparse grad
// regular SGD: out[:] = param[:] - lr[0] * grad[:];
// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:]
//
// Note: when use sparse SGD, and if out != param,
// the out rows which are not selected have not beed changed, which maybe empty
template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
T* out, const sgd_attr_t* attr) {
PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
auto h_idx = rows[i];
PADDLE_ENFORCE_LT(h_idx, attr->param_height);
PADDLE_ENFORCE_GE(h_idx, 0);
for (int64_t j = 0; j < attr->grad_width; ++j) {
out[h_idx * attr->grad_width + j] =
param[h_idx * attr->grad_width + j] -
lr[0] * grad[i * attr->grad_width + j];
}
}
}
#define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \
......@@ -496,6 +526,8 @@ DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
DECLARE_REFER_KERNEL(Sgd, SgdTuples);
#undef DECLARE_REFER_KERNEL
} // namespace refer
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <random>
#include <string>
#include <vector>
......@@ -36,13 +37,13 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
}
template <typename T>
void ExpectEQ(const T* target, const T* refer, int n) {
void ExpectEQ(const T* target, const T* refer, size_t n) {
if (std::is_floating_point<T>::value) {
for (int i = 0; i < n; ++i) {
for (size_t i = 0; i < n; ++i) {
EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
}
} else {
for (int i = 0; i < n; ++i) {
for (size_t i = 0; i < n; ++i) {
EXPECT_EQ(target[i], refer[i]);
}
}
......@@ -296,6 +297,45 @@ struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
}
};
template <typename T>
struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
std::vector<int64_t>, std::vector<T>,
typename jit::SgdTuples<T>::attr_type> {
void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
const std::vector<T>& param, const std::vector<T>& grad,
const std::vector<int64_t>& rows, const std::vector<T>& oref,
const typename jit::SgdTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(param.size(),
static_cast<size_t>(attr.param_height * attr.param_width));
EXPECT_EQ(grad.size(),
static_cast<size_t>(attr.grad_height * attr.grad_width));
EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
EXPECT_EQ(param.size(), oref.size());
const T* param_data = param.data();
const T* grad_data = grad.data();
const int64_t* rows_data = rows.data();
const T* oref_data = oref.data();
std::vector<T> out(oref.size());
T* o_data = out.data();
tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
// only the selected rows should be equal
for (size_t i = 0; i < rows.size(); ++i) {
ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
oref_data + rows[i] * attr.grad_width, attr.grad_width);
}
// inplace
std::copy(param.begin(), param.end(), out.begin());
tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
for (size_t i = 0; i < rows.size(); ++i) {
ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
oref_data + rows[i] * attr.grad_width, attr.grad_width);
}
}
};
template <typename T>
struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
std::vector<T>,
......@@ -704,6 +744,60 @@ void TestEmbSeqPoolKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestSgdKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
const int64_t upper) -> std::vector<int64_t> {
PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
PADDLE_ENFORCE_GT(n, 0);
std::vector<int64_t> all, out;
for (int i = 0; i < n; ++i) {
all.push_back(i);
}
std::random_shuffle(all.begin(), all.end());
out.insert(out.begin(), all.begin(), all.begin() + n);
return out;
};
for (int param_h : {1, 10}) {
for (int grad_w : TestSizes()) {
std::vector<T> param(param_h * grad_w);
std::vector<T> param_out(param_h * grad_w);
RandomVec<T>(param_h * grad_w, param.data(), -2.f, 2.f);
const T* param_data = param.data();
T* out_data = param_out.data();
for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
std::vector<T> grad(rows_size * grad_w);
std::vector<int64_t> rows =
UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
RandomVec<T>(rows_size * grad_w, grad.data(), -2.f, 2.f);
const int64_t* rows_data = rows.data();
const T* grad_data = grad.data();
auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
EXPECT_TRUE(ref != nullptr);
jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
// inplace test
std::vector<T> inp(param.size());
std::copy(param.begin(), param.end(), inp.begin());
T* inp_data = inp.data();
ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr);
// only the selected rows should be equal
for (int i = 0; i < rows_size; ++i) {
ExpectEQ<T>(inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w,
grad_w);
}
TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
std::vector<T>, std::vector<int64_t>, std::vector<T>>(
attr, lr, param, grad, rows, param_out, attr);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
......@@ -943,6 +1037,11 @@ TEST(JITKernel, kEmbSeqPool) {
TestEmbSeqPoolKernel<jit::kEmbSeqPool, double, CPUPlace>();
}
TEST(JITKernel, kSgd) {
TestSgdKernel<jit::kSgd, float, CPUPlace>();
TestSgdKernel<jit::kSgd, double, CPUPlace>();
}
TEST(JITKernel, kNCHW16CMulNC) {
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
namespace paddle {
namespace operators {
......@@ -32,53 +33,57 @@ class SGDOpKernel : public framework::OpKernel<T> {
if (param_var->IsType<framework::LoDTensor>()) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
// Actually, all tensors are LoDTensor except SelectedRows.
if (grad_var->IsType<framework::LoDTensor>()) {
param_out->mutable_data<T>(ctx.GetPlace());
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto p = framework::EigenVector<T>::Flatten(*param);
auto g = framework::EigenVector<T>::Flatten(*grad);
auto o = framework::EigenVector<T>::Flatten(*param_out);
auto *lr = learning_rate->data<T>();
o = p - lr[0] * g;
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
int64_t rows_idx = 0;
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
auto sgd =
jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
} else if (grad_var->IsType<framework::SelectedRows>()) {
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows();
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad->rows().size() == 0) {
if (grad_rows.size() == 0) {
return;
}
auto grad_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);
PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
auto &grad_value = grad->value();
auto &grad_rows = grad->rows();
size_t grad_row_numel = grad_value.numel() / grad_rows.size();
PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
param_out->numel() / grad_height);
auto *grad_data = grad_value.data<T>();
auto *out_data = param_out->data<T>();
auto *lr = learning_rate->data<T>();
for (size_t i = 0; i < grad_rows.size(); i++) {
PADDLE_ENFORCE(grad_rows[i] < grad_height,
"Input rows index should less than height");
for (size_t j = 0; j < grad_row_numel; j++) {
out_data[grad_rows[i] * grad_row_numel + j] -=
lr[0] * grad_data[i * grad_row_numel + j];
}
}
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
const T *lr = learning_rate->data<T>();
const int64_t *rows_data = grad_rows.data();
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
jit::sgd_attr_t attr;
attr.param_height = out_dims[0];
attr.param_width = param_out->numel() / attr.param_height;
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
auto sgd =
jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册