Functionalize distributed_fused_lamb kernel (#53896)

* update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update HostAlloc * update param name * update cpu kernel * remove kernel header * update * update

Functionalize distributed_fused_lamb kernel (#53896)
* update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update HostAlloc * update param name * update cpu kernel * remove kernel header * update * update
5f8e7d8f · huangjiyi · GitHub · 6e0cf610 · 5f8e7d8f · 5f8e7d8f
5 changed files
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/kernel_registry.h"
 namespace paddle {
 namespace operators {
@@ -170,8 +171,63 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb,
                             ops::DistributedFusedLambOp,
                             ops::DistributedFusedLambOpMaker);
-PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb,
+namespace phi {
+namespace fusion {
+template <typename T, typename Context>
+void DistributedFusedLambKernel(const Context &dev_ctx,
+                                const std::vector<const DenseTensor *> &param,
+                                const std::vector<const DenseTensor *> &grad,
+                                const paddle::optional<DenseTensor> &fp32_param,
+                                const paddle::optional<DenseTensor> &fp32_grad,
+                                const paddle::optional<DenseTensor> &fp16_param,
+                                const paddle::optional<DenseTensor> &fp16_grad,
+                                const DenseTensor &moment1,
+                                const DenseTensor &moment2,
+                                const DenseTensor &beta1_pow,
+                                const DenseTensor &beta2_pow,
+                                const DenseTensor &param_offsets,
+                                const DenseTensor &fp32_partial_offsets,
+                                const DenseTensor &fp16_partial_offsets,
+                                const DenseTensor &param_info,
+                                const DenseTensor &param_order,
+                                const DenseTensor &learning_rate,
+                                const DenseTensor &global_scale,
+                                int acc_steps,
+                                float beta1,
+                                float beta2,
+                                float epsilon,
+                                float max_global_grad_norm,
+                                float weight_decay,
+                                bool clip_after_allreduce,
+                                bool use_master_param_norm,
+                                bool use_master_acc_grad,
+                                bool is_grad_scaled_by_nranks,
+                                bool use_hierarchical_allreduce,
+                                int64_t nranks,
+                                const std::vector<int> &ring_ids,
+                                DenseTensor *fp32_param_out,
+                                DenseTensor *fp16_param_out,
+                                DenseTensor *fp32_acc_grad,
+                                DenseTensor *fp16_acc_grad,
+                                DenseTensor *moment1_out,
+                                DenseTensor *moment2_out,
+                                DenseTensor *beta1_pow_out,
+                                DenseTensor *beta2_pow_out,
+                                DenseTensor *param_out,
+                                DenseTensor *found_inf,
+                                DenseTensor *acc_step,
+                                DenseTensor *stop_update,
+                                DenseTensor *step) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "The distributed_fused_lamb operator does not support CPU yet."));
+}
+}  // namespace fusion
+}  // namespace phi
+PD_REGISTER_KERNEL(distributed_fused_lamb,
                   CPU,
                   ALL_LAYOUT,
-                          ops::DistributedFusedLambOpKernel,
+                   phi::fusion::DistributedFusedLambKernel,
                   float) {}
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace operators {
-template <typename T, typename DevCtx>
-class DistributedFusedLambOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "The distributed_fused_lamb operator does not support CPU yet."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h
+++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h
@@ -18,6 +18,8 @@
 #include "math.h"  // NOLINT
+#include "paddle/phi/core/cuda_stream.h"
 namespace paddle {
 namespace operators {

--- a/paddle/phi/ops/compat/distributed_fused_lamb_sig.cc
+++ b/paddle/phi/ops/compat/distributed_fused_lamb_sig.cc
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+KernelSignature DistributedFusedLambOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("distributed_fused_lamb",
+                         {"Param",
+                          "Grad",
+                          "FP32FusedParam",
+                          "FP32FusedGrad",
+                          "FP16FusedParam",
+                          "FP16FusedGrad",
+                          "Moment1",
+                          "Moment2",
+                          "Beta1Pow",
+                          "Beta2Pow",
+                          "FusedParamOffsets",
+                          "FP32ShardFusedParamOffsets",
+                          "FP16ShardFusedParamOffsets",
+                          "ParamInfo",
+                          "ParamOrder",
+                          "LearningRate",
+                          "GlobalScale"},
+                         {"acc_steps",
+                          "beta1",
+                          "beta2",
+                          "epsilon",
+                          "max_global_grad_norm",
+                          "weight_decay",
+                          "clip_after_allreduce",
+                          "use_master_param_norm",
+                          "use_master_acc_grad",
+                          "is_grad_scaled_by_nranks",
+                          "use_hierarchical_allreduce",
+                          "nranks",
+                          "ring_ids"},
+                         {"FP32FusedParamOut",
+                          "FP16FusedParamOut",
+                          "FP32AccFusedGrad",
+                          "FP16AccFusedGrad",
+                          "Moment1Out",
+                          "Moment2Out",
+                          "Beta1PowOut",
+                          "Beta2PowOut",
+                          "ParamOut",
+                          "FoundInf",
+                          "AccStep",
+                          "StopUpdate",
+                          "Step"});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(distributed_fused_lamb,
+                           phi::DistributedFusedLambOpArgumentMapping);