diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
index d62254220d5da3176c4eded29c0f4fc886862b9e..89b8d54a463b03076c9489b842540ea4a4f68a82 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
@@ -115,10 +115,10 @@ class LargeScaleFuseAdamOpKernel
             "param_row should have the same size with grad_row"));
 
     T lr_ = lr[0];
-    T beta1_ = beta1_pow->data<T>()[0];
-    T beta2_ = beta2_pow->data<T>()[0];
+    T beta1_pow_ = beta1_pow->data<T>()[0];
+    T beta2_pow_ = beta2_pow->data<T>()[0];
 
-    lr_ *= sqrt(1 - beta1_) / (1 - beta2_);
+    lr_ *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
 
     for (size_t i = 0; i < in_rows.size(); i++) {
       auto &params = values[i][0];
@@ -131,8 +131,8 @@ class LargeScaleFuseAdamOpKernel
 
       for (int x = 0; x < grad_width; ++x) {
         auto g = grad_v.data<T>()[grad_width * i + x];
-        m1_data[x] = beta1_ * m1_data[x] + (1 - beta1_) * g;
-        m2_data[x] = beta2_ * m2_data[x] + (1 - beta2_) * g * g;
+        m1_data[x] = beta1 * m1_data[x] + (1 - beta1) * g;
+        m2_data[x] = beta2 * m2_data[x] + (1 - beta2) * g * g;
         p_data[x] -= lr_ * (m1_data[x] / (sqrt(m2_data[x]) + epsilon));
      }
    }
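
For reference, the patched kernel now matches the standard Adam step with the
bias correction folded into the learning rate, lr_t = lr * sqrt(1 - beta2^t) /
(1 - beta1^t). The old code had two bugs: it swapped beta1_pow and beta2_pow in
that correction, and it used the t-th powers (beta1_pow/beta2_pow) instead of
the raw decay rates (beta1/beta2) when updating the moment estimates. A minimal
standalone sketch of the fixed per-row update follows; the helper name and
signature are hypothetical, not part of the Paddle sources.

// adam_row_update: hypothetical standalone illustration of the update the
// fixed kernel performs for one parameter row.
#include <cmath>
#include <cstddef>

template <typename T>
void adam_row_update(T *p, T *m1, T *m2, const T *g, std::size_t width, T lr,
                     T beta1, T beta2, T beta1_pow, T beta2_pow, T epsilon) {
  // Bias correction folded into the learning rate (beta*_pow == beta*^t);
  // this is the line the patch fixes: sqrt is taken over (1 - beta2^t) and
  // the divisor is (1 - beta1^t), not the other way around.
  const T lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  for (std::size_t x = 0; x < width; ++x) {
    // Moment estimates decay with the raw beta1/beta2 scalars, not their
    // t-th powers (the second mix-up the patch fixes).
    m1[x] = beta1 * m1[x] + (1 - beta1) * g[x];
    m2[x] = beta2 * m2[x] + (1 - beta2) * g[x] * g[x];
    p[x] -= lr_t * (m1[x] / (std::sqrt(m2[x]) + epsilon));
  }
}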