From 943dedec4c85e611780cc552783fa313d0ea4e95 Mon Sep 17 00:00:00 2001
From: phlrain
Date: Tue, 1 Mar 2022 13:15:25 +0000
Subject: [PATCH] add sgd kernel; test=develop

---
 paddle/fluid/framework/operator.cc                  |   6 +-
 paddle/phi/core/kernel_registry.h                   |   6 +
 paddle/phi/kernels/gpu/sgd_kernel.cu                |   2 +-
 paddle/phi/ops/compat/sgd_sig.cc                    |   2 -
 .../fluid/tests/unittests/test_sgd_op.py            | 728 +++++++++-------
 5 files changed, 380 insertions(+), 364 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b12ad552ab..e03a969c68 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2048,7 +2048,11 @@ void OperatorWithKernel::BuildPhiKernelContext(
       // deal with optional here
       if ((it == ctx.inputs.end() || it->second.size() == 0) &&
           (input_defs[i].type_index ==
-           std::type_index(typeid(paddle::optional<const phi::DenseTensor&>)))) {
+               std::type_index(
+                   typeid(paddle::optional<const phi::DenseTensor&>)) ||
+           input_defs[i].type_index ==
+               std::type_index(
+                   typeid(paddle::optional<const phi::SelectedRows&>)))) {
         pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr);
         auto end_idx = start_idx + 1;
         pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx),
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index 7a05452cbe..2b04d173af 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -81,6 +81,12 @@ struct KernelArgsParseFunctor {
                               default_tensor_layout,
                               default_key.dtype(),
                               arg_type);
+      } else if (arg_type == std::type_index(typeid(
+                                 paddle::optional<const SelectedRows&>))) {
+        args_def->AppendInput(default_key.backend(),
+                              default_tensor_layout,
+                              default_key.dtype(),
+                              arg_type);
       } else if (arg_type ==
                  std::type_index(typeid(const std::vector&))) {
         args_def->AppendInput(default_key.backend(),
diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu
index 74c377b559..7dd5a03383 100644
--- a/paddle/phi/kernels/gpu/sgd_kernel.cu
+++ b/paddle/phi/kernels/gpu/sgd_kernel.cu
@@ -14,6 +14,7 @@
 
 #include "paddle/phi/kernels/sgd_kernel.h"
 
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
@@ -72,7 +73,6 @@ void SGDDenseKernel(const Context& dev_ctx,
                     bool multi_precision,
                     DenseTensor* param_out,
                     DenseTensor* master_param_out) {
-  LOG(ERROR) << "run here";
   using MPDType = typename paddle::operators::details::MPTypeTrait<T>::Type;
   // do check here
   // if (multi_precision) {
diff --git a/paddle/phi/ops/compat/sgd_sig.cc b/paddle/phi/ops/compat/sgd_sig.cc
index ac75cf1d5d..cdf1a221f7 100644
--- a/paddle/phi/ops/compat/sgd_sig.cc
+++ b/paddle/phi/ops/compat/sgd_sig.cc
@@ -17,9 +17,7 @@ namespace phi {
 
 KernelSignature SGDOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  LOG(ERROR) << "11";
   if (ctx.IsDenseTensorInput("Grad")) {
-    LOG(ERROR) << "dense";
     return KernelSignature("sgd",
                            {"Param", "LearningRate", "Grad", "MasterParam"},
                            {"multi_precision"},
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index e8ba53e018..817150a21f 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -24,366 +24,374 @@ import paddle
 
 paddle.enable_static()
 
-# class TestSGDOp(OpTest):
-#     def setUp(self):
-#         self.op_type = "sgd"
-#         self.conf()
-#         w = np.random.random((self.h, self.w)).astype("float32")
-#         g = np.random.random((self.h, 
self.w)).astype("float32") -# lr = np.array([0.1]).astype("float32") - -# self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} -# self.outputs = {'ParamOut': w - lr * g} - -# def conf(self): -# self.h = 102 -# self.w = 105 - -# def test_check_output(self): -# self.check_output() - -# class TestSGDOpCase8X(TestSGDOp): -# def conf(self): -# self.h = 10 -# self.w = 64 - -# class TestSparseSGDOp(unittest.TestCase): -# def check_with_place(self, place): -# scope = core.Scope() - -# # create and initialize Grad Variable -# height = 10 -# rows = [0, 4, 7] -# self.conf() - -# grad_selected_rows = scope.var('Grad').get_selected_rows() -# grad_selected_rows.set_height(height) -# grad_selected_rows.set_rows(rows) -# np_array = np.ones((len(rows), self.row_numel)).astype("float32") -# np_array[0, 0] = 2.0 -# np_array[2, 8] = 4.0 - -# grad_tensor = grad_selected_rows.get_tensor() -# grad_tensor.set(np_array, place) - -# # create and initialize Param Variable -# param = scope.var('Param').get_tensor() -# param_array = np.full((height, self.row_numel), 5.0).astype("float32") -# param.set(param_array, place) - -# # create and initialize LeraningRate Variable -# lr = scope.var('LearningRate').get_tensor() -# lr_array = np.full((1), 2.0).astype("float32") -# lr.set(lr_array, place) - -# # create and run sgd operator -# sgd_op = Operator( -# "sgd", -# Param='Param', -# Grad='Grad', -# ParamOut='Param', -# LearningRate='LearningRate') -# sgd_op.run(scope, place) - -# # get and compare result -# result_array = np.array(param) - -# # rows[0] = 0, 5.0 - 2.0 * 2.0 -# self.assertAlmostEqual(1.0, result_array[rows[0], 0]) -# # rows[0] = 0, 5.0 - 2.0 * 1.0 -# self.assertAlmostEqual(3.0, result_array[rows[0], 2]) -# # 5.0 - 2.0 * 0.0 -# self.assertAlmostEqual(5.0, result_array[1, 0]) -# # rows[1] = 4, 5.0 - 2.0 * 1.0 -# self.assertAlmostEqual(3.0, result_array[rows[1], 10]) -# # 5.0 - 2.0 * 0.0 -# self.assertAlmostEqual(5.0, result_array[5, 8]) -# # rows[2] = 7, 5.0 - 2.0 * 1.0 -# self.assertAlmostEqual(3.0, result_array[rows[2], 1]) -# # rows[2] = 7, 5.0 - 2.0 * 4.0 -# self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) - -# def test_sparse_sgd(self): -# places = [core.CPUPlace()] -# if core.is_compiled_with_cuda(): -# places.append(core.CUDAPlace(0)) -# for place in places: -# self.check_with_place(place) - -# def conf(self): -# self.row_numel = 12 - -# class TestSparseSGDOpCase8X(TestSparseSGDOp): -# def conf(self): -# self.row_numel = 16 - -# class TestSGDOpOptimizeSelectedRows(unittest.TestCase): -# def check_with_place(self, place): -# scope = core.Scope() - -# row_width = 12 -# # create and initialize Grad Variable -# grad_height = 10 -# grad_rows = [0, 4, 7] - -# grad_selected_rows = scope.var('Grad').get_selected_rows() -# grad_selected_rows.set_height(grad_height) -# grad_selected_rows.set_rows(grad_rows) -# grad_array = np.ones((len(grad_rows), row_width)).astype("float32") -# grad_array[0, 0] = 2.0 -# grad_array[2, 8] = 4.0 - -# grad_tensor = grad_selected_rows.get_tensor() -# grad_tensor.set(grad_array, place) - -# # create and initialize Param Variable -# # create and initialize W Variable -# param_rows = [0, 1, 2, 3, 4, 5, 6, 7] - -# # init Param -# w_selected_rows = scope.var('Param').get_selected_rows() -# w_selected_rows.set_height(len(param_rows)) -# w_selected_rows.set_rows(param_rows) -# w_selected_rows.sync_index() -# w_array = np.ones((len(param_rows), row_width)).astype("float32") -# for i in range(len(param_rows)): -# w_array[i] *= i -# w_tensor = w_selected_rows.get_tensor() -# 
w_tensor.set(w_array, place) - -# w_before_optimize = np.array(w_tensor) - -# # create and initialize LeraningRate Variable -# lr_value = 0.1 -# lr = scope.var('LearningRate').get_tensor() -# lr_array = np.full((1), lr_value).astype("float32") -# lr.set(lr_array, place) - -# # optimize with Python -# w_after_optimize = np.copy(w_before_optimize) -# for index, id in enumerate(grad_rows): -# w_after_optimize[id] = w_before_optimize[ -# id] - lr_value * grad_array[index] - -# # create and run sgd operator -# sgd_op = Operator( -# "sgd", -# Param='Param', -# Grad='Grad', -# ParamOut='Param', -# LearningRate='LearningRate') -# sgd_op.run(scope, place) - -# # get and compare result -# result_array = np.array(w_tensor) -# assert (result_array == w_after_optimize).all() - -# def test_sparse_parameter_sgd(self): -# places = [core.CPUPlace()] -# # do not support GPU kernel currently -# for place in places: -# self.check_with_place(place) - -# class TestSGDOpWithLargeInput(unittest.TestCase): -# def runTest(self): -# paddle.enable_static() -# data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64') -# label = fluid.layers.fill_constant( -# shape=[1, 150], value=0.5, dtype='float32') -# emb = fluid.embedding(input=data, size=(10000000, 150), dtype='float32') -# out = fluid.layers.l2_normalize(x=emb, axis=-1) - -# cost = fluid.layers.square_error_cost(input=out, label=label) -# avg_cost = fluid.layers.mean(cost) -# sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -# sgd_optimizer.minimize(avg_cost) - -# place = fluid.CPUPlace() -# exe = fluid.Executor(place) -# exe.run(fluid.default_startup_program()) -# compiled_prog = fluid.compiler.CompiledProgram( -# fluid.default_main_program()) -# result = exe.run(compiled_prog, fetch_list=[avg_cost]) - -# class TestSGDV2(unittest.TestCase): -# def test_sgd_dygraph(self): -# paddle.disable_static() -# value = np.arange(26).reshape(2, 13).astype("float32") -# a = paddle.to_tensor(value) -# linear = paddle.nn.Linear(13, 5) -# # This can be any optimizer supported by dygraph. 
-# adam = paddle.optimizer.SGD(learning_rate=0.01, -# parameters=linear.parameters(), -# weight_decay=0.01) -# out = linear(a) -# out.backward() -# adam.step() -# adam.clear_gradients() - -# def test_sgd(self): -# paddle.enable_static() - -# def check_sgd_optimizer(optimizer_attr): -# init_program = paddle.static.Program() -# program = paddle.static.Program() -# block = program.global_block() -# mul_x = block.create_parameter( -# dtype="float32", -# shape=[5, 10], -# lod_level=0, -# name="mul.x", -# optimize_attr=optimizer_attr) -# mul_y = block.create_var( -# dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") -# mul_out = block.create_var( -# dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") -# mean_out = block.create_var( -# dtype="float32", shape=[1], lod_level=0, name="mean.out") -# block.append_op( -# type="mul", -# inputs={"X": mul_x, -# "Y": mul_y}, -# outputs={"Out": mul_out}, -# attrs={"x_num_col_dims": 1}) -# block.append_op( -# type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) -# sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01) -# opts, _ = sgd_optimizer.minimize(mean_out, init_program) -# return opts - -# opts = check_sgd_optimizer({'learning_rate': 1.1}) -# self.assertEqual(len(opts), 2) -# self.assertEqual([op.type for op in opts], ["scale", "sgd"]) - -# opts = check_sgd_optimizer({'learning_rate': 1.0}) -# self.assertEqual(len(opts), 1) -# self.assertEqual([op.type for op in opts], ["sgd"]) - -# def test_raise_error(self): -# self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None) - -# def test_sgd_group_dygraph(self): -# paddle.disable_static() -# value = np.arange(26).reshape(2, 13).astype("float32") -# a = paddle.to_tensor(value) -# linear_1 = paddle.nn.Linear(13, 5) -# linear_2 = paddle.nn.Linear(5, 3) -# # This can be any optimizer supported by dygraph. 
-# adam = paddle.optimizer.SGD(learning_rate=0.01, -# parameters=[{ -# 'params': linear_1.parameters() -# }, { -# 'params': linear_2.parameters(), -# 'weight_decay': 0.001, -# 'learning_rate': 0.1 -# }], -# weight_decay=0.01) -# out = linear_1(a) -# out = linear_2(out) -# out.backward() -# adam.step() -# adam.clear_gradients() - -# class TestSGDMultiPrecision2_0(unittest.TestCase): -# def dygraph_sgd_mp(self, mp): -# paddle.disable_static() -# paddle.seed(10) -# paddle.set_device('gpu') -# input = paddle.randn((2, 2)) -# model = paddle.nn.Linear(2, 2) -# optimizer = paddle.optimizer.SGD(parameters=model.parameters(), -# multi_precision=mp) -# if mp == True: -# model = paddle.amp.decorate(models=model, level='O2') -# scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - -# for idx in range(5): -# if mp == True: -# with paddle.amp.auto_cast(level='O2'): -# output = model(input) -# loss = paddle.mean(output) -# scaled = scaler.scale(loss) -# scaled.backward() -# scaler.minimize(optimizer, scaled) -# optimizer.clear_grad() -# else: -# output = model(input) -# loss = paddle.mean(output) -# optimizer.step() -# optimizer.clear_grad() - -# return output, model.parameters() - -# def static_sgd_mp(self, mp): -# paddle.enable_static() -# paddle.seed(10) -# np.random.seed(10) -# exe = paddle.static.Executor('gpu') -# train_program = paddle.static.Program() -# startup_program = paddle.static.Program() -# optimizer = paddle.optimizer.SGD(multi_precision=mp) - -# if mp: -# optimizer = paddle.static.amp.decorate( -# optimizer, -# init_loss_scaling=128.0, -# use_dynamic_loss_scaling=True, -# use_pure_fp16=True, -# use_fp16_guard=False) -# with paddle.static.program_guard(train_program, startup_program): -# if mp: -# data = paddle.static.data( -# shape=[2, 2], name='X', dtype='float16') -# else: -# data = paddle.static.data( -# shape=[2, 2], name='X', dtype='float32') -# hidden = paddle.static.nn.fc(x=data, size=10) -# loss = paddle.fluid.layers.mean(hidden) -# optimizer.minimize(loss) -# exe.run(startup_program) - -# if mp: -# optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) -# x = np.random.random(size=(2, 2)).astype('float16') -# else: -# x = np.random.random(size=(2, 2)).astype('float32') -# out = [] -# for idx in range(5): -# loss_data, = exe.run(train_program, -# feed={"X": x}, -# fetch_list=[loss.name]) -# out.append(loss_data) -# return out - -# def test_main(self): -# if not paddle.is_compiled_with_cuda(): -# return -# "Test dygraph mode" -# output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True) -# output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False) -# self.assertEqual( -# np.allclose( -# output1_dy.astype('float32').numpy(), -# output2_dy.astype('float32').numpy(), -# atol=1e-01), -# True) -# for idx in range(len(params1_dy)): -# self.assertEqual( -# np.allclose( -# params1_dy[idx].astype('float32').numpy(), -# params2_dy[idx].astype('float32').numpy(), -# atol=1e-01), -# True) -# "Test static mode" -# output1_st = self.static_sgd_mp(mp=True) -# output2_st = self.static_sgd_mp(mp=False) -# for idx in range(len(output1_st)): -# self.assertEqual( -# np.allclose( -# output1_st[idx].astype('float32'), -# output2_st[idx].astype('float32'), -# atol=1e-01), -# True) + +class TestSGDOp(OpTest): + def setUp(self): + self.op_type = "sgd" + self.conf() + w = np.random.random((self.h, self.w)).astype("float32") + g = np.random.random((self.h, self.w)).astype("float32") + lr = np.array([0.1]).astype("float32") + + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs 
= {'ParamOut': w - lr * g} + + def conf(self): + self.h = 102 + self.w = 105 + + def test_check_output(self): + self.check_output() + + +class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h = 10 + self.w = 64 + + +class TestSparseSGDOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7] + self.conf() + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), self.row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + # create and initialize Param Variable + param = scope.var('Param').get_tensor() + param_array = np.full((height, self.row_numel), 5.0).astype("float32") + param.set(param_array, place) + + # create and initialize LeraningRate Variable + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), 2.0).astype("float32") + lr.set(lr_array, place) + + # create and run sgd operator + sgd_op = Operator( + "sgd", + Param='Param', + Grad='Grad', + ParamOut='Param', + LearningRate='LearningRate') + sgd_op.run(scope, place) + + # get and compare result + result_array = np.array(param) + + # rows[0] = 0, 5.0 - 2.0 * 2.0 + self.assertAlmostEqual(1.0, result_array[rows[0], 0]) + # rows[0] = 0, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[0], 2]) + # 5.0 - 2.0 * 0.0 + self.assertAlmostEqual(5.0, result_array[1, 0]) + # rows[1] = 4, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[1], 10]) + # 5.0 - 2.0 * 0.0 + self.assertAlmostEqual(5.0, result_array[5, 8]) + # rows[2] = 7, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[2], 1]) + # rows[2] = 7, 5.0 - 2.0 * 4.0 + self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + + def test_sparse_sgd(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + def conf(self): + self.row_numel = 12 + + +class TestSparseSGDOpCase8X(TestSparseSGDOp): + def conf(self): + self.row_numel = 16 + + +class TestSGDOpOptimizeSelectedRows(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + row_width = 12 + # create and initialize Grad Variable + grad_height = 10 + grad_rows = [0, 4, 7] + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(grad_height) + grad_selected_rows.set_rows(grad_rows) + grad_array = np.ones((len(grad_rows), row_width)).astype("float32") + grad_array[0, 0] = 2.0 + grad_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(grad_array, place) + + # create and initialize Param Variable + # create and initialize W Variable + param_rows = [0, 1, 2, 3, 4, 5, 6, 7] + + # init Param + w_selected_rows = scope.var('Param').get_selected_rows() + w_selected_rows.set_height(len(param_rows)) + w_selected_rows.set_rows(param_rows) + w_selected_rows.sync_index() + w_array = np.ones((len(param_rows), row_width)).astype("float32") + for i in range(len(param_rows)): + w_array[i] *= i + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + w_before_optimize = np.array(w_tensor) + + # create and initialize LeraningRate Variable + lr_value = 0.1 + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), lr_value).astype("float32") + 
lr.set(lr_array, place) + + # optimize with Python + w_after_optimize = np.copy(w_before_optimize) + for index, id in enumerate(grad_rows): + w_after_optimize[id] = w_before_optimize[ + id] - lr_value * grad_array[index] + + # create and run sgd operator + sgd_op = Operator( + "sgd", + Param='Param', + Grad='Grad', + ParamOut='Param', + LearningRate='LearningRate') + sgd_op.run(scope, place) + + # get and compare result + result_array = np.array(w_tensor) + assert (result_array == w_after_optimize).all() + + def test_sparse_parameter_sgd(self): + places = [core.CPUPlace()] + # do not support GPU kernel currently + for place in places: + self.check_with_place(place) + + +class TestSGDOpWithLargeInput(unittest.TestCase): + def runTest(self): + paddle.enable_static() + data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64') + label = fluid.layers.fill_constant( + shape=[1, 150], value=0.5, dtype='float32') + emb = fluid.embedding(input=data, size=(10000000, 150), dtype='float32') + out = fluid.layers.l2_normalize(x=emb, axis=-1) + + cost = fluid.layers.square_error_cost(input=out, label=label) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + compiled_prog = fluid.compiler.CompiledProgram( + fluid.default_main_program()) + result = exe.run(compiled_prog, fetch_list=[avg_cost]) + + +class TestSGDV2(unittest.TestCase): + def test_sgd_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_sgd(self): + paddle.enable_static() + + def check_sgd_optimizer(optimizer_attr): + init_program = paddle.static.Program() + program = paddle.static.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr=optimizer_attr) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) + return opts + + opts = check_sgd_optimizer({'learning_rate': 1.1}) + self.assertEqual(len(opts), 2) + self.assertEqual([op.type for op in opts], ["scale", "sgd"]) + + opts = check_sgd_optimizer({'learning_rate': 1.0}) + self.assertEqual(len(opts), 1) + self.assertEqual([op.type for op in opts], ["sgd"]) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None) + + def test_sgd_group_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer 
supported by dygraph. + adam = paddle.optimizer.SGD(learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1 + }], + weight_decay=0.01) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + +class TestSGDMultiPrecision2_0(unittest.TestCase): + def dygraph_sgd_mp(self, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.SGD(parameters=model.parameters(), + multi_precision=mp) + if mp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if mp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_sgd_mp(self, mp): + paddle.enable_static() + paddle.seed(10) + np.random.seed(10) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.SGD(multi_precision=mp) + + if mp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False) + with paddle.static.program_guard(train_program, startup_program): + if mp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16') + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32') + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.fluid.layers.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if mp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + loss_data, = exe.run(train_program, + feed={"X": x}, + fetch_list=[loss.name]) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True) + output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False) + self.assertEqual( + np.allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + atol=1e-01), + True) + for idx in range(len(params1_dy)): + self.assertEqual( + np.allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + atol=1e-01), + True) + "Test static mode" + output1_st = self.static_sgd_mp(mp=True) + output2_st = self.static_sgd_mp(mp=False) + for idx in range(len(output1_st)): + self.assertEqual( + np.allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + atol=1e-01), + True) class TestSGDMultiPrecision1_0(unittest.TestCase): -- GitLab
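
Note (not part of the patch): the re-enabled TestSGDMultiPrecision2_0 test above is what routes the AMP/multi-precision dygraph path into the new phi SGD GPU kernel added by this change. The snippet below is a minimal sketch of that flow, condensed from the test code in the diff; it is illustrative only and assumes a CUDA-enabled Paddle build so that paddle.set_device('gpu') and AMP level 'O2' are available.

# Minimal sketch of the multi-precision dygraph SGD flow exercised by
# TestSGDMultiPrecision2_0 (illustrative only; assumes a CUDA build).
import paddle

paddle.disable_static()
paddle.seed(10)
paddle.set_device('gpu')

model = paddle.nn.Linear(2, 2)
optimizer = paddle.optimizer.SGD(parameters=model.parameters(),
                                 multi_precision=True)
# O2 decoration keeps fp32 master weights; with multi_precision the sgd
# kernel updates the fp16 params and the fp32 master copies together.
model = paddle.amp.decorate(models=model, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.randn((2, 2))
for _ in range(5):
    with paddle.amp.auto_cast(level='O2'):
        loss = paddle.mean(model(x))
    scaled = scaler.scale(loss)         # scale the loss before backward
    scaled.backward()
    scaler.minimize(optimizer, scaled)  # unscale and apply the SGD update
    optimizer.clear_grad()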