Unverified commit dda74715, authored by taixiurong, committed by GitHub

xpu-paddlepaddle-57 [Task] adamw lr_ratio support (#50979)

Parent a8fff38f
@@ -87,8 +87,23 @@ void AdamwDenseKernel(const Context& dev_ctx,
     beta1_pow_ptr = xpu_beta1_pow.template data<float>();
     beta2_pow_ptr = xpu_beta2_pow.template data<float>();
   }
-  if (with_decay) {
-    int r = xpu::adamw(
+  if (!with_decay) {
+    coeff = static_cast<float>(0.0);
+  }
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  float* new_lr = RAII_GUARD.alloc_l3_or_gm<float>(learning_rate.numel());
+  PADDLE_ENFORCE_XDNN_NOT_NULL(new_lr);
+  int r = 0;
+  r = xpu::scale(dev_ctx.x_context(),
+                 learning_rate.template data<float>(),
+                 new_lr,
+                 learning_rate.numel(),
+                 false,
+                 lr_ratio,
+                 0.0f);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+  r = xpu::adamw(
       dev_ctx.x_context(),
       reinterpret_cast<const XPUType*>(grad.template data<T>()),
       moment1.template data<float>(),
@@ -96,7 +111,7 @@ void AdamwDenseKernel(const Context& dev_ctx,
       reinterpret_cast<const XPUType*>(param.template data<T>()),
       beta1_pow_ptr,
       beta2_pow_ptr,
-      learning_rate.template data<float>(),
+      new_lr,
       dev_ctx.template Alloc<float>(moment1_out),
       dev_ctx.template Alloc<float>(moment2_out),
       reinterpret_cast<XPUType*>(dev_ctx.template Alloc<T>(param_out)),
@@ -106,28 +121,9 @@ void AdamwDenseKernel(const Context& dev_ctx,
       coeff,
       param.numel());
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw");
-  } else {
-    int r = xpu::adam(
-        dev_ctx.x_context(),
-        reinterpret_cast<const XPUType*>(grad.template data<T>()),
-        moment1.template data<float>(),
-        moment2.template data<float>(),
-        reinterpret_cast<const XPUType*>(param.template data<T>()),
-        beta1_pow_ptr,
-        beta2_pow_ptr,
-        learning_rate.template data<float>(),
-        dev_ctx.template Alloc<float>(moment1_out),
-        dev_ctx.template Alloc<float>(moment2_out),
-        reinterpret_cast<XPUType*>(dev_ctx.template Alloc<T>(param_out)),
-        beta1_,
-        beta2_,
-        epsilon_,
-        param.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "adam");
-  }
   if (!use_global_beta_pow) {
-    // update in cpu and then copy to xpu
+    // update in cpu
     if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
       const float* beta1_pow_p = beta1_pow.template data<float>();
       dev_ctx.template HostAlloc<float>(beta1_pow_out)[0] =
@@ -136,7 +132,7 @@ void AdamwDenseKernel(const Context& dev_ctx,
       dev_ctx.template HostAlloc<float>(beta2_pow_out)[0] =
           beta2_ * beta2_pow_p[0];
       xpu_wait(dev_ctx.x_context()->xpu_stream);
-    } else {
+    } else {  // update in xpu
       float* beta1_pow_out_p = dev_ctx.template Alloc<float>(beta1_pow_out);
       float* beta2_pow_out_p = dev_ctx.template Alloc<float>(beta2_pow_out);
       int r = xpu::scale(dev_ctx.x_context(),
......
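For readers skimming the kernel change: instead of branching between `xpu::adamw` and `xpu::adam`, the new code zeroes `coeff` when decay is off, pre-scales the learning rate by `lr_ratio` into `new_lr` via `xpu::scale`, and then issues a single `xpu::adamw` call. The NumPy sketch below mirrors that flow as a reference only; `adamw_reference` is not a name from the patch, and the exact bias-correction arrangement inside `xpu::adamw` may differ from this canonical form.

```python
import numpy as np

def adamw_reference(param, grad, m1, m2, lr, beta1_pow, beta2_pow,
                    beta1=0.9, beta2=0.999, epsilon=1e-8,
                    lr_ratio=1.0, coeff=0.01, with_decay=True):
    """Single-path AdamW update mirroring the structure of the kernel above."""
    if not with_decay:
        coeff = 0.0             # kernel: coeff = static_cast<float>(0.0)
    new_lr = lr * lr_ratio      # kernel: xpu::scale(..., lr_ratio, 0.0f) into new_lr
    m1 = beta1 * m1 + (1.0 - beta1) * grad
    m2 = beta2 * m2 + (1.0 - beta2) * grad * grad
    m1_hat = m1 / (1.0 - beta1_pow)   # bias correction
    m2_hat = m2 / (1.0 - beta2_pow)
    # decoupled weight decay plus the Adam step, both driven by the scaled lr
    param = param - new_lr * (m1_hat / (np.sqrt(m2_hat) + epsilon) + coeff * param)
    return param, m1, m2
```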
@@ -17,6 +17,7 @@ import sys
sys.path.append("..")
import unittest
from functools import partial
import numpy as np
from op_test_xpu import XPUOpTest
@@ -301,6 +302,345 @@ class XPUTestAdamwOp2(XPUOpTestWrapper):
            adam.step()
            adam.clear_gradients()
class TestAdamWOpLayerwiseLR(TestAdamWOp):
def setUp(self):
np.random.seed(2022)
paddle.seed(2022)
def test_adamw_op_dygraph(self):
paddle.disable_static()
linear1 = paddle.nn.Linear(
13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0)
)
linear2 = paddle.nn.Linear(
8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0)
)
# fix the linear name, simple_lr_setting function will use the name
linear1.weight.name = "linear_1.w_0"
linear1.bias.name = "linear_1.b_0"
linear2.weight.name = "linear_2.w_0"
linear2.bias.name = "linear_2.b_0"
fc1_w = np.array(linear1.weight)
fc1_w_mon1 = np.zeros_like(fc1_w)
fc1_w_mon2 = np.zeros_like(fc1_w)
fc1_b = np.array(linear1.bias)
fc1_b_mon1 = np.zeros_like(fc1_b)
fc1_b_mon2 = np.zeros_like(fc1_b)
fc2_w = np.array(linear2.weight)
fc2_w_mon1 = np.zeros_like(fc2_w)
fc2_w_mon2 = np.zeros_like(fc2_w)
fc2_b = np.array(linear2.bias)
fc2_b_mon1 = np.zeros_like(fc2_b)
fc2_b_mon2 = np.zeros_like(fc2_b)
simple_lr_fun = partial(
simple_lr_setting, decay_rate=0.8, n_layers=2
)
learning_rate = 0.001
weight_decay = 0.01
beta1 = 0.9
beta2 = 0.999
opt = paddle.optimizer.AdamW(
learning_rate=learning_rate,
parameters=[
{'params': linear1.parameters()},
{
'params': linear2.parameters(),
},
],
apply_decay_param_fun=lambda name: True,
weight_decay=weight_decay,
lr_ratio=simple_lr_fun,
)
def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
np_inputs = {
'Param': param,
'Grad': grad,
'Moment1': moment1,
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1**t]).astype("float32"),
'Beta2Pow': np.array([beta2**t]).astype("float32"),
}
np_attrs = {
'epsilon': 1e-8,
'beta1': beta1,
'beta2': beta2,
"lr_ratio": lr_ratio,
"coeff": weight_decay,
"with_decay": True,
}
param_out, moment1_out, moment2_out = adamw_step(
np_inputs, np_attrs
)
return param_out, moment1_out, moment2_out
for i in range(5):
a = paddle.to_tensor(
np.random.uniform(-1, 1, (2, 13)).astype("float32")
)
a1 = linear1(a)
out = linear2(a1)
out = paddle.mean(out)
out.backward()
fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
fc1_w,
np.array(linear1.weight.grad),
fc1_w_mon1,
fc1_w_mon2,
simple_lr_fun(linear1.weight),
i + 1,
)
fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
fc1_b,
np.array(linear1.bias.grad),
fc1_b_mon1,
fc1_b_mon2,
simple_lr_fun(linear1.bias),
i + 1,
)
fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
fc2_w,
np.array(linear2.weight.grad),
fc2_w_mon1,
fc2_w_mon2,
simple_lr_fun(linear2.weight),
i + 1,
)
fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
fc2_b,
np.array(linear2.bias.grad),
fc2_b_mon1,
fc2_b_mon2,
simple_lr_fun(linear2.bias),
i + 1,
)
opt.step()
opt.clear_gradients()
np.testing.assert_allclose(
linear1.weight.numpy(), fc1_w, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
linear1.bias.numpy(), fc1_b, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
linear2.weight.numpy(), fc2_w, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
linear2.bias.numpy(), fc2_b, rtol=1e-5, atol=1e-5
)
def test_adamw_op(self):
paddle.enable_static()
place = fluid.XPUPlace(0)
learning_rate = 0.0001
beta1 = 0.85
beta2 = 0.95
weight_decay = 0.01
epsilon = 1e-8
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
x = fluid.data(name='x', shape=[None, 10], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
weight_attr1 = paddle.framework.ParamAttr(
name="linear_0.w_0"
)
bias_attr1 = paddle.framework.ParamAttr(
name="linear_0.b_0",
initializer=paddle.nn.initializer.Constant(value=1.0),
)
weight_attr2 = paddle.framework.ParamAttr(
name="linear_1.w_0"
)
bias_attr2 = paddle.framework.ParamAttr(
name="linear_1.b_0",
initializer=paddle.nn.initializer.Constant(value=1.0),
)
linear1 = paddle.nn.Linear(
10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1
)
linear2 = paddle.nn.Linear(
32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2
)
out = linear1(x)
out = linear2(out)
fc1_w_mon1 = np.zeros((linear1.weight.shape)).astype(
"float32"
)
fc1_w_mon2 = np.zeros((linear1.weight.shape)).astype(
"float32"
)
fc1_b_mon1 = np.zeros((linear1.bias.shape)).astype(
"float32"
)
fc1_b_mon2 = np.zeros((linear1.bias.shape)).astype(
"float32"
)
fc2_w_mon1 = np.zeros((linear2.weight.shape)).astype(
"float32"
)
fc2_w_mon2 = np.zeros((linear2.weight.shape)).astype(
"float32"
)
fc2_b_mon1 = np.zeros((linear2.bias.shape)).astype(
"float32"
)
fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype(
"float32"
)
cost = paddle.nn.functional.square_error_cost(
input=out, label=y
)
avg_cost = paddle.mean(cost)
simple_lr_fun = partial(
simple_lr_setting, decay_rate=0.8, n_layers=2
)
opt = paddle.optimizer.AdamW(
learning_rate=learning_rate,
beta1=beta1,
beta2=beta2,
weight_decay=weight_decay,
epsilon=epsilon,
lr_ratio=simple_lr_fun,
)
opt.minimize(avg_cost)
def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
np_inputs = {
'Param': param,
'Grad': grad,
'Moment1': moment1,
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1**t]).astype("float32"),
'Beta2Pow': np.array([beta2**t]).astype("float32"),
}
np_attrs = {
'epsilon': epsilon,
'beta1': beta1,
'beta2': beta2,
"lr_ratio": lr_ratio,
"coeff": weight_decay,
"with_decay": True,
}
param_out, moment1_out, moment2_out = adamw_step(
np_inputs, np_attrs
)
return param_out, moment1_out, moment2_out
fetch_list1 = [
"linear_0.w_0",
"linear_0.b_0",
"linear_1.w_0",
"linear_1.b_0",
]
fetch_list2 = [
"linear_0.w_0",
"linear_0.w_0@GRAD",
"linear_0.b_0",
"linear_0.b_0@GRAD",
"linear_1.w_0",
"linear_1.w_0@GRAD",
"linear_1.b_0",
"linear_1.b_0@GRAD",
]
exe = fluid.Executor(place)
exe.run(startup)
test_prog = train_prog.clone(for_test=True)
for i in range(5):
inputs = np.random.random(size=[8, 10]).astype('float32')
outputs = np.random.random(size=[8, 1]).astype('float32')
param = exe.run(
test_prog,
feed={"x": inputs, "y": outputs},
fetch_list=fetch_list1,
)
params_and_gras = exe.run(
train_prog,
feed={"x": inputs, "y": outputs},
fetch_list=fetch_list2,
)
fc1_w = param[0]
fc1_w_grad = params_and_gras[1]
fc1_b = param[1]
fc1_b_grad = params_and_gras[3]
fc2_w = param[2]
fc2_w_grad = params_and_gras[5]
fc2_b = param[3]
fc2_b_grad = params_and_gras[7]
fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
fc1_w,
fc1_w_grad,
fc1_w_mon1,
fc1_w_mon2,
simple_lr_fun(linear1.weight),
i + 1,
)
fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
fc1_b,
fc1_b_grad,
fc1_b_mon1,
fc1_b_mon2,
simple_lr_fun(linear1.bias),
i + 1,
)
fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
fc2_w,
fc2_w_grad,
fc2_w_mon1,
fc2_w_mon2,
simple_lr_fun(linear2.weight),
i + 1,
)
fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
fc2_b,
fc2_b_grad,
fc2_b_mon1,
fc2_b_mon2,
simple_lr_fun(linear2.bias),
i + 1,
)
np.testing.assert_allclose(
params_and_gras[0], fc1_w, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
params_and_gras[2], fc1_b, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
params_and_gras[4], fc2_w, rtol=1e-5, atol=1e-5
)
np.testing.assert_allclose(
params_and_gras[6], fc2_b, rtol=1e-5, atol=1e-5
)
paddle.disable_static()
support_types = get_xpu_op_support_types('adamw')
for stype in support_types:
......
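Both new tests build their per-parameter ratio via `partial(simple_lr_setting, decay_rate=0.8, n_layers=2)`; `simple_lr_setting` is imported from Paddle's shared test utilities and is not part of this diff. Purely as an illustration of what such a layer-wise ratio function can look like, here is a hypothetical stand-in, assuming parameter names of the form `linear_<k>.w_0` / `linear_<k>.b_0` as fixed in the dygraph test above:

```python
def layerwise_lr_ratio(param, decay_rate=0.8, n_layers=2):
    """Hypothetical layer-wise ratio: earlier layers get a smaller lr multiplier."""
    name = param.name
    if name.startswith("linear_"):
        # "linear_1.w_0" -> layer index 1, "linear_2.b_0" -> layer index 2
        depth = int(name.split("_")[1].split(".")[0])
    else:
        depth = 0
    return decay_rate ** (n_layers - depth)
```

The optimizer calls the ratio function once per parameter and multiplies the base learning rate by the returned value, which is exactly the product the `new_lr` scaling in the XPU kernel consumes.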
@@ -178,9 +178,12 @@ class AdamW(Optimizer):
             raise TypeError("weight_decay should be float or Tensor.")
         if lr_ratio is not None:
             assert isinstance(lr_ratio, Callable)
-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 raise NotImplementedError(
-                    "'lr_ratio' is unimplemented in CPU, XPU and NPU"
+                    "'lr_ratio' is unimplemented in CPU, and NPU"
                 )
         if parameters is not None:
......
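With the check above relaxed to accept XPU builds, passing `lr_ratio` to `paddle.optimizer.AdamW` no longer raises NotImplementedError on such builds. A minimal dygraph usage sketch, assuming a Paddle build compiled with XPU support (the lambda ratio here is arbitrary, not taken from the tests):

```python
import paddle

paddle.set_device("xpu")  # assumes an XPU-enabled Paddle build
linear = paddle.nn.Linear(13, 8)
opt = paddle.optimizer.AdamW(
    learning_rate=1e-3,
    parameters=linear.parameters(),
    weight_decay=0.01,
    lr_ratio=lambda p: 0.8,  # any callable mapping a parameter to a float ratio
)
loss = paddle.mean(linear(paddle.rand([4, 13])))
loss.backward()
opt.step()
opt.clear_grad()
```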