From 3ac6c189a3ec5189c6d154945dee72c607da904a Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Wed, 14 Apr 2021 11:25:32 +0200
Subject: [PATCH] adds new CPU kernel for SGD op supporting BF16 data type
 (#32162)

* Initial draft for SGD BF16 kernel.
* Unit tests for SGD with BF16 data type.
* Add VLOG message to SGD BF16 op CPU kernel.
* Enhance error messages and error types.
* Refactor SGD op kernels to leverage some common code.
* Make it easier to add new kernel invoke code.
* Fix SGD op kernel for sparse grad.
* Unify quotes style.
* Fix error for ROCM compilation.
* Use specialized PADDLE_ENFORCE_xx functions.
---
 paddle/fluid/operators/optimizers/sgd_op.cc   |   6 +-
 paddle/fluid/operators/optimizers/sgd_op.h    | 303 ++++++++++++------
 .../fluid/tests/unittests/test_sgd_op_bf16.py | 212 ++++++++++++
 tools/static_mode_white_list.py               |   1 +
 4 files changed, 428 insertions(+), 94 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py

diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index 569dbcd6a3..9603411ec4 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include <string>
+
+#include "paddle/fluid/operators/optimizers/sgd_op.h"
+
 namespace paddle {
 namespace operators {
@@ -127,4 +129,6 @@ REGISTER_OPERATOR(
     ops::SGDOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
     sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SGDOpKernel<paddle::platform::CPUDeviceContext,
+                     paddle::platform::bfloat16>,
     ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 1aaf95efc3..076121c0e2 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -13,14 +13,220 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #pragma once + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/platform/bfloat16.h" namespace paddle { namespace operators { +namespace detail { + +template +struct sgd_dense_param_kernel { + void operator()() const {} +}; + +// LodTensor +template +struct sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId> { + void operator()(const framework::ExecutionContext &ctx) const { + VLOG(4) << "[CPU]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + + const auto sz = param_out->numel(); + jit::sgd_attr_t attr(1, sz, 1, sz, 1); + const T *lr = learning_rate->data(); + const T *param_data = param->data(); + const T *grad_data = grad->data(); + int64_t rows_idx = 0; + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); + } +}; + +// SelectedRows +template +struct sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId> { + void operator()(const framework::ExecutionContext &ctx) const { + VLOG(4) << "[CPU]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + + const auto &grad_value = grad->value(); + const auto &grad_rows = grad->rows(); + const T *param_data = param->data(); + const T *grad_data = grad_value.data(); + const T *lr = learning_rate->data(); + const int64_t *rows_data = grad_rows.data(); + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + jit::sgd_attr_t attr; + attr.param_height = param_out->dims()[0]; + attr.param_width = param_out->numel() / attr.param_height; + attr.grad_height = grad_rows.size(); // note: it is not grad->height() + attr.grad_width = grad_value.numel() / attr.grad_height; + attr.selected_rows_size = grad_rows.size(); + + auto sgd = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + sgd(lr, param_data, grad_data, rows_data, out_data, &attr); + } +}; + +// LodTensor +template <> +struct sgd_dense_param_kernel< + platform::bfloat16, framework::VarTypeTrait::kId> { + void operator()(const framework::ExecutionContext &ctx) const { + VLOG(4) << "[CPU]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + param_out->mutable_data(ctx.GetPlace()); + + auto p = framework::EigenVector::Flatten(*param); + auto g = framework::EigenVector::Flatten(*grad); + auto o = framework::EigenVector::Flatten(*param_out); + const auto *lr = learning_rate->data(); + + o = p - lr[0] * g; + } +}; + +// SelectedRows +template <> +struct sgd_dense_param_kernel< + platform::bfloat16, framework::VarTypeTrait::kId> { + void operator()(const framework::ExecutionContext &ctx) const { + VLOG(4) << "[CPU]: sgd_dense_param_kernel"; + const auto *learning_rate = ctx.Input("LearningRate"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); + + const auto &grad_value = grad->value(); + const auto &grad_rows = 
grad->rows(); + const auto grad_height = grad->height(); + const int64_t grad_val_height = static_cast(grad_rows.size()); + const auto grad_width = grad_value.numel() / grad_val_height; + + const auto *grad_data = grad_value.data(); + auto *out_data = param_out->data(); + const auto *lr = learning_rate->data(); + + for (size_t i = 0; i < grad_rows.size(); ++i) { + PADDLE_ENFORCE_LT( + grad_rows[i], grad_height, + platform::errors::OutOfRange( + "Grad rows index value should be less than grad height." + "Got [%s], but expected less than [%s]", + grad_rows[i], grad_height)); + const int64_t row = grad_rows[i]; + for (int64_t j = 0; j < grad_width; ++j) { + out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; + } + } + } +}; + +template +void sgd_op_invoke_dense_param_kernel(const framework::ExecutionContext &ctx) { + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad_var = ctx.InputVar("Grad"); + + if (grad_var->IsType()) { + const auto *grad = ctx.Input("Grad"); + const auto sz = param_out->numel(); + PADDLE_ENFORCE_EQ(param->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Param's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Param's " + "numel = [%s], ParamOut's numel = [%s]", + param->numel(), sz)); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), sz)); + + sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId>()(ctx); + } else if (grad_var->IsType()) { + // TODO(qijun): In Sparse SGD operator, in-place update is enforced. + // This manual optimization brings difficulty to track data dependency. + // It's better to find a more elegant solution. + PADDLE_ENFORCE_EQ(param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp " + "should be equal with ParamOut if variable's " + "type is SelectedRows. ")); + const auto *grad = ctx.Input("Grad"); + + // for distributed training, a sparse var may be empty, + // just skip updating. + if (grad->rows().size() == 0) { + return; + } + + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ( + grad->height(), out_dims[0], + platform::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp " + "should be equal with ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + grad->height(), out_dims[0])); + + auto &grad_value = grad->value(); + auto &grad_rows = grad->rows(); + const auto param_height = param_out->dims()[0]; + const auto param_width = param_out->numel() / param_height; + // note: it is not grad->height() + const auto grad_height = static_cast(grad_rows.size()); + const auto grad_width = grad_value.numel() / grad_height; + + PADDLE_ENFORCE_EQ( + grad_width, param_width, + platform::errors::InvalidArgument( + "The grad_value's numel of SgdOp " + "should be equal with param_out's numel. But received " + "grad_value's numel [%s] and param_out's numel [%s]", + grad_width, param_width)); + + sgd_dense_param_kernel< + T, framework::VarTypeTrait::kId>()(ctx); + } else { + PADDLE_ENFORCE_EQ( + false, true, platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad in SgdOp. 
Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); + } +} + +} // namespace detail + template class SGDOpKernel : public framework::OpKernel { public: @@ -38,102 +244,12 @@ class SGDOpKernel const auto *grad_var = ctx.InputVar("Grad"); if (param_var->IsType()) { - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - // Actually, all tensors are LoDTensor except SelectedRows. - if (grad_var->IsType()) { - const auto *grad = ctx.Input("Grad"); - auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz, - platform::errors::InvalidArgument( - "The input tensor Param's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Param's " - "numel = [%s], ParamOut's numel = [%s]", - param->numel(), sz)); - PADDLE_ENFORCE_EQ(grad->numel(), sz, - platform::errors::InvalidArgument( - "The input tensor Grad's numel of SgdOp " - "should be equal with ParamOut's numel. " - "But received Grad's " - "numel = [%s], ParamOut's numel = [%s]", - grad->numel(), sz)); - - jit::sgd_attr_t attr(1, sz, 1, sz, 1); - const T *lr = learning_rate->data(); - const T *param_data = param->data(); - const T *grad_data = grad->data(); - int64_t rows_idx = 0; - T *out_data = param_out->mutable_data(ctx.GetPlace()); - - auto sgd = - jit::KernelFuncs, platform::CPUPlace>::Cache().At( - attr); - sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); - } else if (grad_var->IsType()) { - // TODO(qijun): In Sparse SGD operator, in-place update is enforced. - // This manual optimization brings difficulty to track data dependency. - // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out, - platform::errors::InvalidArgument( - "The input tensor Param of SgdOp " - "should be equal with ParamOut if variable's " - "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); - auto &grad_rows = grad->rows(); - - // for distributed training, a sparse var may be empty, - // just skip updating. - if (grad_rows.size() == 0) { - return; - } - - auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ( - grad->height(), out_dims[0], - platform::errors::InvalidArgument( - "The input tensor Grad's height of SgdOp " - "should be equal with ParamOut's dims. But received Grad's " - "height [%s] and ParamOut's dims [%s]", - grad->height(), out_dims[0])); - auto &grad_value = grad->value(); - const T *param_data = param->data(); - const T *grad_data = grad_value.data(); - const T *lr = learning_rate->data(); - const int64_t *rows_data = grad_rows.data(); - T *out_data = param_out->mutable_data(ctx.GetPlace()); - - jit::sgd_attr_t attr; - attr.param_height = out_dims[0]; - attr.param_width = param_out->numel() / attr.param_height; - attr.grad_height = grad_rows.size(); // note: it is not grad->height() - attr.grad_width = grad_value.numel() / attr.grad_height; - attr.selected_rows_size = grad_rows.size(); - PADDLE_ENFORCE_EQ( - attr.grad_width, attr.param_width, - platform::errors::InvalidArgument( - "The grad_value's numel of SgdOp " - "should be equal with param_out's numel. But received " - "grad_value's numel [%s] and param_out's numel [%s]", - attr.grad_width, attr.param_width)); - - auto sgd = - jit::KernelFuncs, platform::CPUPlace>::Cache().At( - attr); - sgd(lr, param_data, grad_data, rows_data, out_data, &attr); - } else { - PADDLE_ENFORCE_EQ( - false, true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad in SgdOp. 
Excepted " - "LodTensor or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } + detail::sgd_op_invoke_dense_param_kernel(ctx); } else if (param_var->IsType()) { PADDLE_ENFORCE_EQ(grad_var->IsType(), true, platform::errors::InvalidArgument( - "when param is SelectedRows, " - "gradient should also be SelectedRows")); + "When param is SelectedRows, gradient should also " + "be SelectedRows")); const auto ¶m = param_var->Get(); auto *param_out = ctx.Output("ParamOut"); const auto &grad = grad_var->Get(); @@ -179,5 +295,6 @@ class SGDOpKernel } } }; + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py new file mode 100644 index 0000000000..0717ec80f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.tests.unittests.op_test import ( + OpTest, convert_float_to_uint16, convert_uint16_to_float) +import paddle + + +@unittest.skipIf(not core.supports_bfloat16(), + 'place does not support BF16 evaluation') +class TestSGDOpBF16(OpTest): + def setUp(self): + self.op_type = 'sgd' + self.dtype = np.uint16 + self.conf() + w = np.random.random((self.h, self.w)).astype('float32') + w_bf16 = convert_float_to_uint16(w) + g = np.random.random((self.h, self.w)).astype('float32') + g_bf16 = convert_float_to_uint16(g) + lr = np.array([0.1]).astype('float32') + lr_bf16 = convert_float_to_uint16(lr) + + self.inputs = {'Param': w_bf16, 'Grad': g_bf16, 'LearningRate': lr_bf16} + self.outputs = {'ParamOut': w - lr * g} + + def conf(self): + self.h = 102 + self.w = 105 + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) + + +@unittest.skipIf(not core.supports_bfloat16(), + 'place does not support BF16 evaluation') +class TestSGDOpCase8XBF16(TestSGDOpBF16): + def conf(self): + self.h = 10 + self.w = 64 + + +class TestSparseSGDOpBF16(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + + def ref_optimize(self, params, grad_rows, grad_array, lr_value): + reference = np.copy(params) + for index, id in enumerate(grad_rows): + reference[id] = params[id] - lr_value * grad_array[index] + return reference + + def check_output(self, actual_bf16, reference, atol=0, rtol=0.15e-2): + actual_fp32 = convert_uint16_to_float(actual_bf16) + np.testing.assert_allclose(actual_fp32, reference, atol=atol, rtol=rtol) + + def create_sparse_grad_var(self, scope, place, height, rows, row_numel): + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + # grad_array = 
np.random.random((len(rows), row_numel)).astype('float32')
+        grad_array = np.full((len(rows), row_numel), 2, np.float32)
+        np_array_bf16 = convert_float_to_uint16(grad_array)
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array_bf16, place)
+
+        return grad_tensor, grad_array
+
+    def create_dense_param_var(self, scope, place, height, width):
+        param_tensor = scope.var('Param').get_tensor()
+        # param_array = np.random.random((height, width)).astype('float32')
+        param_array = np.full((height, width), 5, np.float32)
+        param_array_bf16 = convert_float_to_uint16(param_array)
+        param_tensor.set(param_array_bf16, place)
+
+        return param_tensor, param_array
+
+    def create_sparse_param_var(self, scope, place, height, rows, row_numel):
+        param_selected_rows = scope.var('Param').get_selected_rows()
+        param_selected_rows.set_height(height)
+        param_selected_rows.set_rows(rows)
+        param_selected_rows.sync_index()
+        param_array = np.random.random((len(rows), row_numel)).astype('float32')
+        np_array_bf16 = convert_float_to_uint16(param_array)
+
+        param_tensor = param_selected_rows.get_tensor()
+        param_tensor.set(np_array_bf16, place)
+
+        return param_tensor, param_array
+
+    def create_dense_lr_var(self, scope, place):
+        lr_tensor = scope.var('LearningRate').get_tensor()
+        # lr_value = np.random.uniform()
+        lr_value = 2
+        lr_array = np.full((1), lr_value, np.float32)
+        lr_array_bf16 = convert_float_to_uint16(lr_array)
+        lr_tensor.set(lr_array_bf16, place)
+
+        return lr_tensor, lr_value
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradSGDOpBF16(TestSparseSGDOpBF16):
+    def setUp(self):
+        self.setup_params()
+
+    def setup_params(self):
+        self.grad_height = 10
+        self.grad_rows = [0, 4, 7]
+        self.grad_row_numel = 12
+
+    def test_sparse_grad_sgd(self):
+        scope = core.Scope()
+        place = core.CPUPlace()
+        _, grad_array = self.create_sparse_grad_var(
+            scope, place, self.grad_height, self.grad_rows, self.grad_row_numel)
+        param_tensor, param_array = self.create_dense_param_var(
+            scope, place, self.grad_height, self.grad_row_numel)
+        _, lr_value = self.create_dense_lr_var(scope, place)
+
+        sgd_op = Operator(
+            'sgd',
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        sgd_op.run(scope, place)
+
+        reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
+                                      lr_value)
+        output = np.array(param_tensor)
+        self.check_output(output, reference, atol=5e-3, rtol=1e-1)
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16):
+    def setup_params(self):
+        self.grad_height = 14
+        self.grad_rows = [1, 4, 12, 7, 8]
+        self.grad_row_numel = 16
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
+    def setUp(self):
+        self.setup_params()
+
+    def setup_params(self):
+        self.grad_height = 10
+        self.grad_rows = [0, 4, 7]
+        self.grad_row_numel = 12
+        self.param_rows = [a for a in range(self.grad_height)]
+
+    def test_sparse_param_grad_sgd(self):
+        scope = core.Scope()
+        place = core.CPUPlace()
+        _, grad_array = self.create_sparse_grad_var(
+            scope, place, self.grad_height, self.grad_rows, self.grad_row_numel)
+        param_tensor, param_array = self.create_sparse_param_var(
+            scope, place, self.grad_height, self.param_rows,
+            self.grad_row_numel)
+        _, lr_value = self.create_dense_lr_var(scope, place)
+
+        sgd_op = Operator(
+            'sgd',
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        sgd_op.run(scope, place)
+
+        reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
+                                      lr_value)
+        output = np.array(param_tensor)
+        self.check_output(output, reference, atol=5e-3, rtol=1e-1)
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
+    def setup_params(self):
+        self.grad_height = 14
+        self.grad_rows = [1, 4, 12, 7, 8]
+        self.grad_row_numel = 16
+        self.param_rows = [a for a in range(self.grad_height)]
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index ab5b6516b9..5bb4c8a630 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -701,4 +701,5 @@ STATIC_MODE_TESTING_LIST = [
     'test_generate_proposals_v2_op',
     'test_lamb_op_xpu',
     'test_model_cast_to_bf16',
+    'test_sgd_op_bf16',
 ]
-- 
GitLab
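
For reference, the update rule implemented by both new bfloat16 CPU kernels is plain SGD. Below is a minimal NumPy sketch of the dense and selected-rows variants that ref_optimize() in test_sgd_op_bf16.py checks against; the helper names are illustrative only, not Paddle APIs.

import numpy as np


def sgd_dense_reference(param, grad, lr):
    # Dense Param / dense Grad: ParamOut = Param - lr * Grad.
    return param - lr * grad


def sgd_sparse_grad_reference(param, grad_rows, grad_value, lr):
    # Dense Param / SelectedRows Grad: only the selected rows are updated,
    # mirroring the per-row subtraction in the bfloat16 SelectedRows kernel.
    out = np.copy(param)
    for i, row in enumerate(grad_rows):
        out[row] -= lr * grad_value[i]
    return out


if __name__ == '__main__':
    param = np.full((10, 12), 5, dtype=np.float32)
    grad = np.full((3, 12), 2, dtype=np.float32)
    print(sgd_dense_reference(param, np.ones_like(param), 0.1)[0, 0])  # 4.9
    updated = sgd_sparse_grad_reference(param, [0, 4, 7], grad, 2.0)
    print(updated[0, 0], updated[1, 0])  # 1.0 5.0

In the patch itself, the dense bfloat16 path evaluates the same expression through Eigen (o = p - lr[0] * g), and the SelectedRows bfloat16 path applies the identical per-row subtraction in an explicit loop.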
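
The new tests compare bfloat16 results against float32 references with loose tolerances (atol=5e-3 and rtol up to 1e-1) because bfloat16 keeps only 7 explicit mantissa bits. The sketch below emulates the round-trip; it assumes convert_float_to_uint16 keeps the upper 16 bits of the float32 bit pattern and convert_uint16_to_float zero-extends them back, which may differ in detail from the op_test helpers.

import numpy as np


def float_to_bf16_bits(x):
    x = np.asarray(x, dtype=np.float32)
    # Reinterpret float32 as uint32 and keep the high half
    # (sign, 8-bit exponent, 7 explicit mantissa bits).
    return (x.view(np.uint32) >> 16).astype(np.uint16)


def bf16_bits_to_float(bits):
    bits = np.asarray(bits, dtype=np.uint16)
    # Zero-extend back to a float32 bit pattern; the dropped low mantissa
    # bits are the only source of error.
    return (bits.astype(np.uint32) << 16).view(np.float32)


if __name__ == '__main__':
    x = np.array([0.1, 2.0, 105.3], dtype=np.float32)
    x_rt = bf16_bits_to_float(float_to_bf16_bits(x))
    # Relative truncation error stays below roughly 2**-7 (under 1%),
    # which is why check_output() uses rtol/atol instead of exact equality.
    print(np.abs(x - x_rt) / np.abs(x))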