未验证 提交 3ac6c189 编写于 作者: A Adam Osewski 提交者: GitHub

adds new CPU kernel for SGD op supporting BF16 data type (#32162)

* Initial draft for SGD BF16 kernel.

* Unit tests for SGD with BF16 data type.

* Add VLOG message to SGD BF16 op CPU kernel.

* Enhance error messages and error types.

* Refactor SGD op kernels to leverage some common code.

* Make it easier to add new kernel invoke code.

* Fix SGD op kernel for sparse grad.

* Unify quotes style.

* Fix error for ROCM compilation.

* Use specialized PADDLE_ENFORCE_xx functions.
上级 7b9fcaca
......@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
namespace paddle {
namespace operators {
......@@ -127,4 +129,6 @@ REGISTER_OPERATOR(
ops::SGDOpInferVarType);
// Registers the CPU kernels of the `sgd` op, one SGDOpKernel instantiation
// per supported element type: float, bfloat16 and double.
REGISTER_OP_CPU_KERNEL(
    sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SGDOpKernel<paddle::platform::CPUDeviceContext,
                     paddle::platform::bfloat16>,
    ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -13,37 +13,148 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/bfloat16.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SGDOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
namespace detail {
// Primary template of the dense-parameter SGD functor.
//
// T is the element type and VariableTypeId is a framework::VarTypeTrait<>::kId
// value; the specializations in this file use the ids of LoDTensor and
// SelectedRows to pick the update routine matching the Grad variable type.
// This generic version is a no-op that takes no arguments.
template <typename T, int VariableTypeId>
struct sgd_dense_param_kernel {
  void operator()() const {}
};
// LodTensor
template <typename T>
class SGDOpKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
struct sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::LoDTensor>::kId> {
void operator()(const framework::ExecutionContext &ctx) const {
VLOG(4) << "[CPU]: sgd_dense_param_kernel<T, LoDTensor>";
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad = ctx.Input<framework::Tensor>("Grad");
const auto *param_var = ctx.InputVar("Param");
const auto *grad_var = ctx.InputVar("Grad");
const auto sz = param_out->numel();
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
int64_t rows_idx = 0;
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
if (param_var->IsType<framework::LoDTensor>()) {
auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
}
};
// SelectedRows gradient, generic element type T.
//
// Updates a dense Param from a SelectedRows Grad by handing the raw buffers
// and the selected row ids to the cached JIT SGD kernel for T.
template <typename T>
struct sgd_dense_param_kernel<
    T, framework::VarTypeTrait<framework::SelectedRows>::kId> {
  void operator()(const framework::ExecutionContext &ctx) const {
    using SgdKernels = jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>;

    VLOG(4) << "[CPU]: sgd_dense_param_kernel<T, SelectedRows>";

    // Operator inputs / output.
    const auto *lr_tensor = ctx.Input<framework::Tensor>("LearningRate");
    const auto *param_in = ctx.Input<framework::Tensor>("Param");
    auto *param_result = ctx.Output<framework::Tensor>("ParamOut");
    const auto *sparse_grad = ctx.Input<framework::SelectedRows>("Grad");

    const auto &values = sparse_grad->value();
    const auto &row_ids = sparse_grad->rows();

    // Raw buffers consumed by the JIT kernel.
    const T *src = param_in->data<T>();
    const T *grad_src = values.data<T>();
    const T *lr_ptr = lr_tensor->data<T>();
    const int64_t *row_ptr = row_ids.data();
    T *dst = param_result->mutable_data<T>(ctx.GetPlace());

    // Kernel attributes. The gradient height is the number of selected rows
    // (the height of the value tensor), not sparse_grad->height().
    const auto selected_count = row_ids.size();
    jit::sgd_attr_t attr;
    attr.param_height = param_result->dims()[0];
    attr.param_width = param_result->numel() / attr.param_height;
    attr.grad_height = selected_count;
    attr.grad_width = values.numel() / attr.grad_height;
    attr.selected_rows_size = selected_count;

    auto kernel = SgdKernels::Cache().At(attr);
    kernel(lr_ptr, src, grad_src, row_ptr, dst, &attr);
  }
};
// LoDTensor gradient, bfloat16 elements.
//
// Dense update evaluated as one Eigen expression over the flattened tensors:
// ParamOut = Param - LearningRate[0] * Grad.
template <>
struct sgd_dense_param_kernel<
    platform::bfloat16, framework::VarTypeTrait<framework::LoDTensor>::kId> {
  void operator()(const framework::ExecutionContext &ctx) const {
    using BF16 = platform::bfloat16;
    using FlatVector = framework::EigenVector<BF16>;

    VLOG(4) << "[CPU]: sgd_dense_param_kernel<bfloat16, LoDTensor>";

    const auto *lr_tensor = ctx.Input<framework::Tensor>("LearningRate");
    const auto *param_in = ctx.Input<framework::Tensor>("Param");
    auto *param_result = ctx.Output<framework::Tensor>("ParamOut");
    // Every tensor except SelectedRows is in fact a LoDTensor.
    const auto *grad_in = ctx.Input<framework::Tensor>("Grad");

    // Make sure the output buffer exists before it is wrapped below.
    param_result->mutable_data<BF16>(ctx.GetPlace());

    auto param_vec = FlatVector::Flatten(*param_in);
    auto grad_vec = FlatVector::Flatten(*grad_in);
    auto result_vec = FlatVector::Flatten(*param_result);

    const auto *lr_ptr = lr_tensor->data<BF16>();
    result_vec = param_vec - lr_ptr[0] * grad_vec;
  }
};
// SelectedRows gradient, bfloat16 elements.
//
// Applies the SGD step in place on ParamOut, touching only the rows listed
// in the SelectedRows gradient:
//   ParamOut[rows[i]] -= LearningRate[0] * GradValue[i]
// ParamOut is read through data<>() (not mutable_data<>()), so it must
// already hold the parameter values when this functor runs.
//
// Raises platform::errors::OutOfRange when a row id is not smaller than the
// gradient's declared height.
template <>
struct sgd_dense_param_kernel<
    platform::bfloat16, framework::VarTypeTrait<framework::SelectedRows>::kId> {
  void operator()(const framework::ExecutionContext &ctx) const {
    VLOG(4) << "[CPU]: sgd_dense_param_kernel<bfloat16, SelectedRows>";
    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
    const auto *grad = ctx.Input<framework::SelectedRows>("Grad");

    const auto &grad_value = grad->value();
    const auto &grad_rows = grad->rows();
    const auto grad_height = grad->height();
    const int64_t grad_val_height = static_cast<int64_t>(grad_rows.size());

    // An empty sparse gradient means there is nothing to update; returning
    // here also keeps the width computation below from dividing by zero.
    if (grad_val_height == 0) {
      return;
    }
    const auto grad_width = grad_value.numel() / grad_val_height;

    const auto *grad_data = grad_value.data<platform::bfloat16>();
    auto *out_data = param_out->data<platform::bfloat16>();
    const auto *lr = learning_rate->data<platform::bfloat16>();
    // The learning rate is a single scalar; read it once outside the loops.
    const platform::bfloat16 lr_value = lr[0];

    for (int64_t i = 0; i < grad_val_height; ++i) {
      const int64_t row = grad_rows[i];
      PADDLE_ENFORCE_LT(
          row, grad_height,
          platform::errors::OutOfRange(
              "Grad rows index value should be less than grad height. "
              "Got [%s], but expected less than [%s]",
              row, grad_height));
      // i indexes the compact value tensor, row indexes the dense parameter.
      const auto *grad_row = grad_data + i * grad_width;
      auto *out_row = out_data + row * grad_width;
      for (int64_t j = 0; j < grad_width; ++j) {
        out_row[j] -= lr_value * grad_row[j];
      }
    }
  }
};
template <typename T>
void sgd_op_invoke_dense_param_kernel(const framework::ExecutionContext &ctx) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
const auto *grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto sz = param_out->numel();
const auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
......@@ -59,17 +170,8 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
int64_t rows_idx = 0;
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::LoDTensor>::kId>()(ctx);
} else if (grad_var->IsType<framework::SelectedRows>()) {
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
......@@ -80,11 +182,10 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows();
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad_rows.size() == 0) {
if (grad->rows().size() == 0) {
return;
}
......@@ -96,44 +197,59 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));
auto &grad_value = grad->value();
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
const T *lr = learning_rate->data<T>();
const int64_t *rows_data = grad_rows.data();
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
auto &grad_rows = grad->rows();
const auto param_height = param_out->dims()[0];
const auto param_width = param_out->numel() / param_height;
// note: it is not grad->height()
const auto grad_height = static_cast<int64_t>(grad_rows.size());
const auto grad_width = grad_value.numel() / grad_height;
jit::sgd_attr_t attr;
attr.param_height = out_dims[0];
attr.param_width = param_out->numel() / attr.param_height;
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(
attr.grad_width, attr.param_width,
grad_width, param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
attr.grad_width, attr.param_width));
grad_width, param_width));
auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
sgd_dense_param_kernel<
T, framework::VarTypeTrait<framework::SelectedRows>::kId>()(ctx);
} else {
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
false, true, platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
} // namespace detail
// Generic SGD op kernel, parameterized by device context and element type.
// Only the Compute override is declared here; this file provides the
// definition through a partial specialization for platform::CPUDeviceContext.
template <typename DeviceContext, typename T>
class SGDOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override;
};
template <typename T>
class SGDOpKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param_var = ctx.InputVar("Param");
const auto *grad_var = ctx.InputVar("Grad");
if (param_var->IsType<framework::LoDTensor>()) {
detail::sgd_op_invoke_dense_param_kernel<T>(ctx);
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
platform::errors::InvalidArgument(
"when param is SelectedRows, "
"gradient should also be SelectedRows"));
"When param is SelectedRows, gradient should also "
"be SelectedRows"));
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
......@@ -179,5 +295,6 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
}
}
};
} // namespace operators
} // namespace paddle
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.tests.unittests.op_test import (
OpTest, convert_float_to_uint16, convert_uint16_to_float)
import paddle
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSGDOpBF16(OpTest):
    """Dense `sgd` op test fed with bfloat16 (uint16-encoded) inputs on CPU.

    Param, Grad and LearningRate are generated in float32 and converted with
    convert_float_to_uint16; the expected ParamOut is the float32 result of
    `param - lr * grad`.
    """

    def setUp(self):
        self.op_type = 'sgd'
        self.dtype = np.uint16
        self.conf()

        shape = (self.h, self.w)
        param_fp32 = np.random.random(shape).astype('float32')
        param_bf16 = convert_float_to_uint16(param_fp32)
        grad_fp32 = np.random.random(shape).astype('float32')
        grad_bf16 = convert_float_to_uint16(grad_fp32)
        lr_fp32 = np.array([0.1]).astype('float32')
        lr_bf16 = convert_float_to_uint16(lr_fp32)

        self.inputs = dict(
            Param=param_bf16, Grad=grad_bf16, LearningRate=lr_bf16)
        self.outputs = dict(ParamOut=param_fp32 - lr_fp32 * grad_fp32)

    def conf(self):
        """Set the parameter shape; subclasses override this."""
        self.h, self.w = 102, 105

    def test_check_output(self):
        self.check_output_with_place(core.CPUPlace(), check_dygraph=False)
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSGDOpCase8XBF16(TestSGDOpBF16):
    """Same dense BF16 SGD check, run on a 10 x 64 parameter."""

    def conf(self):
        self.h, self.w = 10, 64
class TestSparseSGDOpBF16(unittest.TestCase):
    """Shared helpers for BF16 `sgd` tests whose gradient is a SelectedRows.

    The class defines no test methods itself. Subclasses build the scope
    variables with the `create_*` helpers, run the operator and compare its
    bfloat16 output against `ref_optimize`.
    """

    @classmethod
    def setUpClass(cls):
        # Fixed seed so that randomly generated arrays are reproducible.
        np.random.seed(12345)

    def ref_optimize(self, params, grad_rows, grad_array, lr_value):
        """Return the float reference result of a sparse SGD step.

        Args:
            params: dense parameter array, one row per parameter row.
            grad_rows: row ids of `params` touched by the gradient.
            grad_array: gradient values; row `i` belongs to `grad_rows[i]`.
            lr_value: scalar learning rate.

        Returns:
            A copy of `params` with `lr_value * grad_array[i]` subtracted from
            row `grad_rows[i]`. `params` itself is left untouched.
        """
        reference = np.copy(params)
        for index, row in enumerate(grad_rows):
            # Accumulate on the running result so that a row id listed more
            # than once receives every one of its updates.
            reference[row] -= lr_value * grad_array[index]
        return reference

    def check_output(self, actual_bf16, reference, atol=0, rtol=0.15e-2):
        """Decode the uint16 output to float and compare it to `reference`."""
        actual_fp32 = convert_uint16_to_float(actual_bf16)
        np.testing.assert_allclose(actual_fp32, reference, atol=atol, rtol=rtol)

    def create_sparse_grad_var(self, scope, place, height, rows, row_numel):
        """Create the SelectedRows variable 'Grad' filled with the value 2.

        Returns:
            (grad_tensor, grad_array): the value tensor of the variable and
            the float32 array it was filled from.
        """
        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        # A constant fill is used here instead of random values:
        # grad_array = np.random.random((len(rows), row_numel)).astype('float32')
        grad_array = np.full((len(rows), row_numel), 2, np.float32)
        np_array_bf16 = convert_float_to_uint16(grad_array)

        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(np_array_bf16, place)
        return grad_tensor, grad_array

    def create_dense_param_var(self, scope, place, height, width):
        """Create the dense tensor variable 'Param' filled with the value 5.

        Returns:
            (param_tensor, param_array): the tensor and its float32 source.
        """
        param_tensor = scope.var('Param').get_tensor()
        # A constant fill is used here instead of random values:
        # param_array = np.random.random((height, width)).astype('float32')
        param_array = np.full((height, width), 5, np.float32)
        param_array_bf16 = convert_float_to_uint16(param_array)
        param_tensor.set(param_array_bf16, place)
        return param_tensor, param_array

    def create_sparse_param_var(self, scope, place, height, rows, row_numel):
        """Create the SelectedRows variable 'Param' with random values.

        Returns:
            (param_tensor, param_array): the value tensor of the variable and
            the float32 array it was filled from.
        """
        param_selected_rows = scope.var('Param').get_selected_rows()
        param_selected_rows.set_height(height)
        param_selected_rows.set_rows(rows)
        param_selected_rows.sync_index()
        param_array = np.random.random((len(rows), row_numel)).astype('float32')
        np_array_bf16 = convert_float_to_uint16(param_array)

        param_tensor = param_selected_rows.get_tensor()
        param_tensor.set(np_array_bf16, place)
        return param_tensor, param_array

    def create_dense_lr_var(self, scope, place):
        """Create the one-element tensor variable 'LearningRate' (value 2).

        Returns:
            (lr_tensor, lr_value): the tensor and the scalar it holds.
        """
        lr_tensor = scope.var('LearningRate').get_tensor()
        # A constant is used here instead of a random value:
        # lr_value = np.random.uniform()
        lr_value = 2
        lr_array = np.full((1,), lr_value, np.float32)
        lr_array_bf16 = convert_float_to_uint16(lr_array)
        lr_tensor.set(lr_array_bf16, place)
        return lr_tensor, lr_value
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSparseGradSGDOpBF16(TestSparseSGDOpBF16):
    """BF16 `sgd` with a dense Param and a SelectedRows Grad on CPU."""

    def setUp(self):
        self.setup_params()

    def setup_params(self):
        """Set the gradient layout; subclasses override this."""
        self.grad_height = 10
        self.grad_rows = [0, 4, 7]
        self.grad_row_numel = 12

    def test_sparse_grad_sgd(self):
        scope = core.Scope()
        place = core.CPUPlace()

        grad_array = self.create_sparse_grad_var(
            scope, place, self.grad_height, self.grad_rows,
            self.grad_row_numel)[1]
        param_tensor, param_array = self.create_dense_param_var(
            scope, place, self.grad_height, self.grad_row_numel)
        lr_value = self.create_dense_lr_var(scope, place)[1]

        # ParamOut is bound to the 'Param' variable, so the operator result
        # is read back from param_tensor.
        op_io = {
            'Param': 'Param',
            'Grad': 'Grad',
            'ParamOut': 'Param',
            'LearningRate': 'LearningRate',
        }
        Operator('sgd', **op_io).run(scope, place)

        expected = self.ref_optimize(param_array, self.grad_rows, grad_array,
                                     lr_value)
        actual = np.array(param_tensor)
        self.check_output(actual, expected, atol=5e-3, rtol=1e-1)
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16):
    """Same sparse-gradient check with 14 rows of 16 values and 5 row ids."""

    def setup_params(self):
        self.grad_height, self.grad_row_numel = 14, 16
        self.grad_rows = [1, 4, 12, 7, 8]
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
    """BF16 `sgd` where both Param and Grad are SelectedRows on CPU."""

    def setUp(self):
        self.setup_params()

    def setup_params(self):
        """Set the gradient / parameter layout; subclasses override this."""
        self.grad_height = 10
        self.grad_rows = [0, 4, 7]
        self.grad_row_numel = 12
        # The sparse parameter holds every row id from 0 to grad_height - 1.
        self.param_rows = list(range(self.grad_height))

    def test_sparse_param_grad_sgd(self):
        scope = core.Scope()
        place = core.CPUPlace()

        grad_array = self.create_sparse_grad_var(
            scope, place, self.grad_height, self.grad_rows,
            self.grad_row_numel)[1]
        param_tensor, param_array = self.create_sparse_param_var(
            scope, place, self.grad_height, self.param_rows,
            self.grad_row_numel)
        lr_value = self.create_dense_lr_var(scope, place)[1]

        # ParamOut is bound to the 'Param' variable, so the operator result
        # is read back from param_tensor.
        op_io = {
            'Param': 'Param',
            'Grad': 'Grad',
            'ParamOut': 'Param',
            'LearningRate': 'LearningRate',
        }
        Operator('sgd', **op_io).run(scope, place)

        expected = self.ref_optimize(param_array, self.grad_rows, grad_array,
                                     lr_value)
        actual = np.array(param_tensor)
        self.check_output(actual, expected, atol=5e-3, rtol=1e-1)
@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
    """Same sparse Param/Grad check with 14 rows of 16 values and 5 row ids."""

    def setup_params(self):
        self.grad_height, self.grad_row_numel = 14, 16
        self.grad_rows = [1, 4, 12, 7, 8]
        self.param_rows = list(range(self.grad_height))
# Script entry point: paddle.enable_static() is called before the unittest
# runner starts, so every test in this module runs after that switch.
if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
......@@ -701,4 +701,5 @@ STATIC_MODE_TESTING_LIST = [
'test_generate_proposals_v2_op',
'test_lamb_op_xpu',
'test_model_cast_to_bf16',
'test_sgd_op_bf16',
]
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册