From 263a9e97fd02489a8b3d3006417c120df6021e2a Mon Sep 17 00:00:00 2001
From: MRXLT
Date: Wed, 14 Oct 2020 19:01:03 +0800
Subject: [PATCH] Fix adam (#27778)

* fix adam

* fix gpu adam

* fix code style

* fix ut

* update ut add cuda code
---
 paddle/fluid/operators/optimizers/adam_op.cu        |  9 ++++++---
 paddle/fluid/operators/optimizers/adam_op.h         | 10 ++++++----
 .../tests/unittests/dygraph_to_static/test_bmn.py   |  2 +-
 .../fluid/tests/unittests/test_optimizer_grad.py    |  7 +++++--
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu
index 5373fe15f6..0713237561 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
@@ -38,7 +38,8 @@ __global__ void AdamKernelREG(T beta1, T beta2, T epsilon, T beta1_pow_,
     T mom2 = moment2[id];
     mom1 = beta1 * mom1 + (static_cast<T>(1.0) - beta1) * g;
     mom2 = beta2 * mom2 + (static_cast<T>(1.0) - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 /
+               (sqrt(mom2) + epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
 
     moment1_out[id] = mom1;
     moment2_out[id] = mom2;
@@ -68,7 +69,8 @@ __global__ void AdamKernelMEM(T beta1, T beta2, T epsilon, const T* beta1_pow_,
     T mom2 = moment2[id];
     mom1 = beta1 * mom1 + (static_cast<T>(1.0) - beta1) * g;
     mom2 = beta2 * mom2 + (static_cast<T>(1.0) - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 /
+               (sqrt(mom2) + epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
 
     moment1_out[id] = mom1;
     moment2_out[id] = mom2;
@@ -105,7 +107,8 @@ __global__ void SparseAdamCUDAKernelREG(
     T g = row_idx >= 0 ? grad_[row_idx * row_numel + id % row_numel] : 0;
     mom1 = beta1 * mom1 + (1 - beta1) * g;
     mom2 = beta2 * mom2 + (1 - beta2) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon));
+    p -= lr * (mom1 / (sqrt(mom2) +
+                       epsilon * sqrt(static_cast<T>(1.0) - beta2_pow)));
 
     // Write back to global memory
     mom1_out_[id] = mom1;
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 24e383c871..285362fcd5 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -109,7 +109,7 @@ class AdamFunctor {
 
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
 
     // Write back to global memory
     moment1_out_[i] = mom1;
@@ -181,7 +181,9 @@ class AdamFunctor {
 
     moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
     moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
-    param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_));
+    param_out = param -
+                lr * (moment1_out /
+                      (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow)));
   }
 };
 
@@ -249,7 +251,7 @@ class SparseAdamFunctor {
 
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
 
     // Write back to global memory
     moment1_out_[i] = mom1;
@@ -328,7 +330,7 @@ class SparseAdamFunctor {
 
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow)));
 
     // Write back to global memory
     moment1_out_[i] = mom1;
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index f54f70e4b8..c4f5cc9e2b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -25,7 +25,7 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 
 from predictor_utils import PredictorTools
 
-SEED = 2020
+SEED = 2000
 DATATYPE = 'float32'
 
 program_translator = ProgramTranslator()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
index 6dcd985027..69298f0f6a 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
@@ -22,6 +22,9 @@ import paddle.fluid as fluid
 import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import _append_grad_suffix_
 
+import paddle
+paddle.enable_static()
+
 np.random.seed(10)
 
 SHAPE = [16, 10]
@@ -255,8 +258,8 @@ class TestAdamOptimizer(TestOptimizer):
         moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad)
 
         lr = attr['lr'] * np.sqrt(1. - beta2_pow) / (1. - beta1_pow)
-        param_out = param - lr * (moment1_out /
-                                  (np.sqrt(moment2_out) + epsilon))
+        param_out = param - lr * (moment1_out / (np.sqrt(moment2_out) + epsilon
+                                                 * np.sqrt(1 - beta2_pow)))
 
         # update hyper-parameter of optimizer
         self.param_attr[name]['beta1_pow'] = beta1_pow * beta1
-- 
GitLab
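Note (editorial, not part of the patch): the new denominator is an algebraic
rearrangement of textbook bias-corrected Adam. As the reference implementation
in test_optimizer_grad.py shows, Paddle folds the bias corrections into the
learning rate (lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)); for that folded
form to match Adam exactly, epsilon must also be scaled by
sqrt(1 - beta2_pow), which is what this patch adds. A minimal NumPy sketch of
the equivalence (shapes and hyper-parameter values are illustrative, not taken
from the tests):

    import numpy as np

    np.random.seed(10)
    param = np.random.rand(16, 10)
    grad = np.random.rand(16, 10)
    moment1 = np.random.rand(16, 10)  # first-moment state m_{t-1}
    moment2 = np.random.rand(16, 10)  # second-moment state v_{t-1}
    beta1, beta2, epsilon, lr, t = 0.9, 0.999, 1e-8, 0.001, 3
    beta1_pow, beta2_pow = beta1**t, beta2**t

    moment1_out = beta1 * moment1 + (1. - beta1) * grad
    moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad)

    # Textbook Adam: bias-correct both moments, then add epsilon.
    m_hat = moment1_out / (1. - beta1_pow)
    v_hat = moment2_out / (1. - beta2_pow)
    reference = param - lr * m_hat / (np.sqrt(v_hat) + epsilon)

    # Patched form: fold both corrections into lr and rescale epsilon.
    lr_t = lr * np.sqrt(1. - beta2_pow) / (1. - beta1_pow)
    patched = param - lr_t * (moment1_out / (np.sqrt(moment2_out) +
                                             epsilon * np.sqrt(1. - beta2_pow)))

    assert np.allclose(reference, patched)

Before the patch, the folded update used a bare epsilon, so it diverged from
textbook Adam whenever epsilon was non-negligible relative to sqrt(v_hat).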