diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 15c478e531e9c756bdb4296bbc64e65aab331828..ac53eab64917f62bab371fae6cca889724b0888e 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -314,6 +314,13 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                      reinterpret_cast(ctx).stream());
   }
 #endif
+#if defined(PADDLE_WITH_XPU)
+  else if (platform::is_xpu_place(src.place())) {  // NOLINT
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
+                 size);
+  }
+#endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr,
@@ -341,7 +348,7 @@ inline void TensorToVector(const Tensor& src,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
@@ -349,6 +356,13 @@ inline void TensorToVector(const Tensor& src,
                      reinterpret_cast(ctx).stream());
   }
 #endif
+#if defined(PADDLE_WITH_XPU)
+  else if (platform::is_xpu_place(src.place())) {  // NOLINT
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
+                 size);
+  }
+#endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr,
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 7536654c5f5ccd7ea18911c9530c5ce42ba9ca3f..edc75bda4abdf732c7e0201d75573b45b0477a13 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -157,6 +157,8 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
              "shape of this tensor MUST BE [1].")
         .AsDispensable();
     AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
+    AddInput("SkipUpdate", "(Tensor, optional), Skip the update or not.")
+        .AsDispensable();
 
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("Moment1Out", "(Tensor) Output first moment");
@@ -265,4 +267,10 @@ REGISTER_OP_VERSION(adam)
         "In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
         "used in adam op, "
         "and beta_pow will be updated after all adam op in the model.",
-        false));
+        false))
+    .AddCheckpoint(
+        R"ROC(
+      Upgrade adam, add 1 dispensable input [SkipUpdate].
+    )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "SkipUpdate", "If the value is true, Adam will skip the update."));
diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu
index 2ee2a08bf3bc63c34e18e668e9875d6ef6132951..5c30962cca82594df3d91e29f6a343715d404f6b 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
@@ -172,6 +172,42 @@ class AdamOpCUDAKernel : public framework::OpKernel {
     auto* beta1_pow_out = ctx.Output("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output("Beta2PowOut");
 
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector skip_update_vec;
+      TensorToVector(*skip_update_tensor, ctx.device_context(),
+                     &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "Adam skip update";
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context(), param_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context(), mom1_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context(), mom2_out);
+      framework::TensorCopy(
+          *beta1_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta1_pow_out);
+      framework::TensorCopy(
+          *beta2_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta2_pow_out);
+      return;
+    }
+
     MPDType beta1 = static_cast(ctx.Attr("beta1"));
     if (ctx.HasInput("Beta1Tensor")) {
       auto* beta1_tensor = ctx.Input("Beta1Tensor");
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index bbd4179d84d896d16a6d7e0c8a4fcfbdf039a71d..233bef22d83af0926538fed611246264a4700560 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -414,7 +414,6 @@ class AdamOpKernel : public framework::OpKernel {
     auto* mom1 = ctx.Input("Moment1");
     auto* mom2 = ctx.Input("Moment2");
     auto* lr = ctx.Input("LearningRate");
-
     auto* beta1_pow = ctx.Input("Beta1Pow");
     auto* beta2_pow = ctx.Input("Beta2Pow");
 
@@ -424,6 +423,42 @@ class AdamOpKernel : public framework::OpKernel {
     auto* beta1_pow_out = ctx.Output("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output("Beta2PowOut");
 
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector skip_update_vec;
+      TensorToVector(*skip_update_tensor, ctx.device_context(),
+                     &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "Adam skip update";
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context(), param_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context(), mom1_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context(), mom2_out);
+      framework::TensorCopy(
+          *beta1_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta1_pow_out);
+      framework::TensorCopy(
+          *beta2_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta2_pow_out);
+      return;
+    }
+
     T beta1 = static_cast(ctx.Attr("beta1"));
     if (ctx.HasInput("Beta1Tensor")) {
       auto* beta1_tensor = ctx.Input("Beta1Tensor");
@@ -451,6 +486,7 @@ class AdamOpKernel : public framework::OpKernel {
                             epsilon_tensor->numel()));
       epsilon = static_cast(GetAttrFromTensor(epsilon_tensor));
     }
+
     VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
             << "beta2_pow.numel() : " << beta2_pow->numel();
     VLOG(3) << "param.numel(): " << param->numel();
diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc
index 70fd546e5042c3ae96ec333c251e72396fef0e59..8b33dc64c4e4f0341939e331ab585a4cabe4321c 100644
--- a/paddle/fluid/operators/optimizers/adam_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc
@@ -58,6 +58,42 @@ class AdamNPUKernel : public framework::OpKernel {
     auto* beta1_pow_out = ctx.Output("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output("Beta2PowOut");
 
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector skip_update_vec;
+      TensorToVector(*skip_update_tensor, ctx.device_context(),
+                     &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "Adam skip update";
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context(), param_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context(), mom1_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context(), mom2_out);
+      framework::TensorCopy(
+          *beta1_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta1_pow_out);
+      framework::TensorCopy(
+          *beta2_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta2_pow_out);
+      return;
+    }
+
     bool use_global_beta_pow = ctx.Attr("use_global_beta_pow");
     VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
 
diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
index 0f5706e428e15454e216af8e1067d31720cbf7c7..172088ebcbd58f137d2586c882d527dbbecf3626 100644
--- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -59,6 +59,43 @@ class AdamOpXPUKernel : public framework::OpKernel {
     auto* beta1_pow_out = ctx.Output("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output("Beta2PowOut");
+
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector skip_update_vec;
+      TensorToVector(*skip_update_tensor, ctx.device_context(),
+                     &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "Adam skip update";
+      framework::TensorCopy(
+          param, ctx.GetPlace(),
+          ctx.template device_context(), &param_out);
+      framework::TensorCopy(
+          mom1, ctx.GetPlace(),
+          ctx.template device_context(), &mom1_out);
+      framework::TensorCopy(
+          mom2, ctx.GetPlace(),
+          ctx.template device_context(), &mom2_out);
+      framework::TensorCopy(
+          beta1_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta1_pow_out);
+      framework::TensorCopy(
+          beta2_pow, ctx.GetPlace(),
+          ctx.template device_context(),
+          beta2_pow_out);
+      return;
+    }
+
     PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
                       platform::errors::InvalidArgument(
                           "Tensor holds the wrong size, Expected beta1 pow "
diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
index 8d3a9baa787a03c5f9069ab9802542f29091e4e7..17dad036185d844ecb3ebcc154d78b349bab33d0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
@@ -134,6 +134,60 @@ class TestAdamWithEpsilonTensor(OpTest):
         self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)
 
 
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAdamOpWithSkipUpdate(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.004
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            'Beta1Tensor': np.array([beta1]).astype("float32"),
+            'Beta2Tensor': np.array([beta2]).astype("float32"),
+            'EpsilonTensor': np.array([epsilon]).astype("float32"),
+            "SkipUpdate": np.array([True]).astype("bool"),
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        self.outputs = {
+            'Moment1Out': moment1,
+            'Moment2Out': moment2,
+            'ParamOut': param,
+            'Beta1PowOut': self.inputs['Beta1Pow'],
+            'Beta2PowOut': self.inputs['Beta2Pow'],
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False)
+
+
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestAdamOpWithGlobalBetaPow(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 78ced5691366dbc2b9fe07dbb0a750cf860edf0d..27b66c13aecf33c053217a9980faecee2495953e 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -501,6 +501,55 @@ class TestAdamOpWithGlobalBetaPow(OpTest):
         self.check_output()
 
 
+class TestAdamOpWithSkipUpdate(OpTest):
+    def setUp(self):
+        '''Test Adam Op with SkipUpdate
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+        beta1 = 0.85
+        beta2 = 0.95
+
+        learning_rate = 0.001
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32"),
+            "Beta1Tensor": np.array([beta1]).astype("float32"),
+            "Beta2Tensor": np.array([beta2]).astype("float32"),
+            "EpsilonTensor": np.array([epsilon]).astype("float32"),
+            "SkipUpdate": np.array([True]).astype("bool"),
+        }
+
+        attributes = {'epsilon': epsilon}
+
+        self.attrs = {'use_global_beta_pow': True}
+
+        # SkipUpdate=True, so the outputs are just copies of the inputs.
+        self.outputs = {
+            'Moment1Out': moment1,
+            'Moment2Out': moment2,
+            'ParamOut': param,
+            'Beta1PowOut': self.inputs['Beta1Pow'],
+            'Beta2PowOut': self.inputs['Beta2Pow'],
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAdamOpV2(unittest.TestCase):
     def test_adam_op(self):
         place = fluid.CPUPlace()