Unverified commit 263a9e97, authored by MRXLT, committed by GitHub

Fix adam (#27778)

* fix adam

* fix gpu adam

* fix code style

* fix ut

* update ut add cuda code
Parent b0edda4d
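
Every hunk below makes the same change: the epsilon term in Adam's denominator is additionally scaled by sqrt(1 - beta2_pow). Paddle folds the bias correction into the learning rate (lr = learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t), as the unit-test reference at the end of this diff shows), and that rewrite only matches the standard bias-corrected update p -= lr * m_hat / (sqrt(v_hat) + epsilon) when epsilon carries the same sqrt(1 - beta2^t) factor. A minimal NumPy sketch of the corrected single-step update (illustrative only, not the Paddle API; the function name is mine):

import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    # One Adam step in the form used by the kernels below: bias correction
    # folded into the learning rate, epsilon rescaled by sqrt(1 - beta2**t).
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    p = p - lr_t * m / (np.sqrt(v) + eps * np.sqrt(1 - beta2 ** t))
    return p, m, v
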
@@ -38,7 +38,8 @@ __global__ void AdamKernelREG(T beta1, T beta2, T epsilon, T beta1_pow_,
     T mom2 = moment2[id];
     mom1 = beta1 * mom1 + (static_cast<T>(1.0) - beta1) * g;
     mom2 = beta2 * mom2 + (static_cast<T>(1.0) - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 /
+               (sqrt(mom2) + epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
     moment1_out[id] = mom1;
     moment2_out[id] = mom2;
@@ -68,7 +69,8 @@ __global__ void AdamKernelMEM(T beta1, T beta2, T epsilon, const T* beta1_pow_,
     T mom2 = moment2[id];
     mom1 = beta1 * mom1 + (static_cast<T>(1.0) - beta1) * g;
     mom2 = beta2 * mom2 + (static_cast<T>(1.0) - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 /
+               (sqrt(mom2) + epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
     moment1_out[id] = mom1;
     moment2_out[id] = mom2;
@@ -105,7 +107,8 @@ __global__ void SparseAdamCUDAKernelREG(
     T g = row_idx >= 0 ? grad_[row_idx * row_numel + id % row_numel] : 0;
     mom1 = beta1 * mom1 + (1 - beta1) * g;
     mom2 = beta2 * mom2 + (1 - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 / (sqrt(mom2) +
+                       epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
     // Write back to global memory
     mom1_out_[id] = mom1;
......
@@ -109,7 +109,7 @@ class AdamFunctor<T, GPUAdam> {
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
     // Write back to global memory
     moment1_out_[i] = mom1;
@@ -181,7 +181,9 @@ class AdamFunctor<T, CPUAdam> {
     moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
     moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
-    param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_));
+    param_out = param -
+                lr * (moment1_out /
+                      (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow)));
   }
 };
@@ -249,7 +251,7 @@ class SparseAdamFunctor<T, GPUAdam> {
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
     // Write back to global memory
     moment1_out_[i] = mom1;
@@ -328,7 +330,7 @@ class SparseAdamFunctor<T, CPUAdam> {
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
     // Write back to global memory
     moment1_out_[i] = mom1;
......
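
For reference, the algebra behind the epsilon scaling in the C++/CUDA hunks above (a plain-text derivation using the lr rescaling shown in the unit-test reference below):

    m_hat = m_t / (1 - beta1^t),   v_hat = v_t / (1 - beta2^t)
    lr_t  = lr * sqrt(1 - beta2^t) / (1 - beta1^t)

    lr * m_hat / (sqrt(v_hat) + eps)
      = lr * m_t / ((1 - beta1^t) * (sqrt(v_t) / sqrt(1 - beta2^t) + eps))
      = lr_t * m_t / (sqrt(v_t) + eps * sqrt(1 - beta2^t))

Without the added factor, the effective epsilon is inflated by 1 / sqrt(1 - beta2^t), roughly 31x on the first step for beta2 = 0.999, which damps early updates relative to reference Adam.
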
@@ -25,7 +25,7 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from predictor_utils import PredictorTools
-SEED = 2020
+SEED = 2000
 DATATYPE = 'float32'
 program_translator = ProgramTranslator()
......
@@ -22,6 +22,9 @@ import paddle.fluid as fluid
 import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import _append_grad_suffix_
+import paddle
+
+paddle.enable_static()
 np.random.seed(10)
 SHAPE = [16, 10]
@@ -255,8 +258,8 @@ class TestAdamOptimizer(TestOptimizer):
     moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad)
     lr = attr['lr'] * np.sqrt(1. - beta2_pow) / (1. - beta1_pow)
-    param_out = param - lr * (moment1_out /
-                              (np.sqrt(moment2_out) + epsilon))
+    param_out = param - lr * (moment1_out / (np.sqrt(moment2_out) + epsilon
+                                             * np.sqrt(1 - beta2_pow)))
     # update hyper-parameter of optimizer
     self.param_attr[name]['beta1_pow'] = beta1_pow * beta1
......
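
As a standalone sanity check (assumed values, not part of the test file), the reference update above can be compared with the conventional bias-corrected Adam formulation; the two forms are algebraically identical and should agree to floating-point tolerance:

import numpy as np

rng = np.random.RandomState(10)
param, grad = rng.randn(16, 10), rng.randn(16, 10)
moment1, moment2 = np.zeros_like(param), np.zeros_like(param)
beta1, beta2, epsilon, base_lr = 0.9, 0.999, 1e-8, 0.001
beta1_pow, beta2_pow = beta1, beta2  # first step, so beta^1

moment1_out = beta1 * moment1 + (1. - beta1) * grad
moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad)

# Form used by the kernels and by the test reference above.
lr = base_lr * np.sqrt(1. - beta2_pow) / (1. - beta1_pow)
out_kernel = param - lr * (moment1_out / (np.sqrt(moment2_out) + epsilon
                                          * np.sqrt(1. - beta2_pow)))

# Conventional bias-corrected Adam.
m_hat = moment1_out / (1. - beta1_pow)
v_hat = moment2_out / (1. - beta2_pow)
out_ref = param - base_lr * m_hat / (np.sqrt(v_hat) + epsilon)

assert np.allclose(out_kernel, out_ref)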