Unverified commit 0bc369ef, authored by Z Zero Rains, committed by GitHub

[Fluid] Move distributed_fused_lamb_init to phi (#55993)

Parent e358ddac
@@ -12,7 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
@@ -116,9 +117,3 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb_init,
ops::DistributedFusedLambInitOp,
ops::DistributedFusedLambInitOpMaker);
PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb_init,
CPU,
ALL_LAYOUT,
ops::DistributedFusedLambInitOpKernel,
float) {}
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void DistributedFusedLambInitOpKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& param,
const std::vector<const DenseTensor*>& grad,
float beta1,
float beta2,
const std::vector<int>& apply_weight_decay,
int alignment,
int rank,
int nranks,
DenseTensor* fp32_fused_param,
DenseTensor* fp32_fused_grad,
DenseTensor* fp16_fused_param,
DenseTensor* fp16_fused_grad,
DenseTensor* moment1,
DenseTensor* moment2,
DenseTensor* beta1_pow,
DenseTensor* beta2_pow,
DenseTensor* fused_param_offsets,
DenseTensor* fp32_shard_fused_param_offsets,
DenseTensor* fp16_shard_fused_param_offsets,
DenseTensor* param_info,
DenseTensor* param_order,
std::vector<DenseTensor*> param_out,
std::vector<DenseTensor*> master_param_out,
std::vector<DenseTensor*> grad_out,
DenseTensor* global_scale,
DenseTensor* step);
} // namespace fusion
} // namespace phi
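For reference, the declaration above follows the phi kernel parameter convention: the device context comes first, then inputs, then attributes, then output pointers, in the same order the argument mapping later in this commit lists them. A minimal sketch of that convention for a hypothetical one-output kernel, assuming phi's DenseTensor as above (all names here are illustrative, not part of this PR):

template <typename T, typename Context>
void ScaledAddKernel(const Context& dev_ctx,  // device context first
                     const DenseTensor& x,    // inputs...
                     const DenseTensor& y,
                     float scale,             // ...then attributes...
                     DenseTensor* out);       // ...then outputs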
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void DistributedFusedLambInitOpKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& param,
const std::vector<const DenseTensor*>& grad,
float beta1,
float beta2,
const std::vector<int>& apply_weight_decay,
int alignment,
int rank,
int nranks,
DenseTensor* fp32_fused_param,
DenseTensor* fp32_fused_grad,
DenseTensor* fp16_fused_param,
DenseTensor* fp16_fused_grad,
DenseTensor* moment1,
DenseTensor* moment2,
DenseTensor* beta1_pow,
DenseTensor* beta2_pow,
DenseTensor* fused_param_offsets,
DenseTensor* fp32_shard_fused_param_offsets,
DenseTensor* fp16_shard_fused_param_offsets,
DenseTensor* param_info,
DenseTensor* param_order,
std::vector<DenseTensor*> param_out,
std::vector<DenseTensor*> master_param_out,
std::vector<DenseTensor*> grad_out,
DenseTensor* global_scale,
DenseTensor* step) {
  PADDLE_THROW(phi::errors::Unavailable(
      "The distributed_fused_lamb_init kernel is not supported on CPU yet."));
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(distributed_fused_lamb_init,
CPU,
ALL_LAYOUT,
phi::fusion::DistributedFusedLambInitOpKernel,
float) {
kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT16);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT16);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(7).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(8).SetDataType(phi::DataType::INT32);
kernel->OutputAt(9).SetDataType(phi::DataType::INT32);
kernel->OutputAt(10).SetDataType(phi::DataType::INT32);
kernel->OutputAt(11).SetDataType(phi::DataType::INT32);
kernel->OutputAt(12).SetDataType(phi::DataType::INT32);
kernel->OutputAt(13).SetDataType(kernel_key.dtype());
kernel->OutputAt(14).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(15).SetDataType(kernel_key.dtype());
kernel->OutputAt(16).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(17).SetDataType(phi::DataType::INT64);
}
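This registration pins a dtype for each of the kernel's 18 outputs, indexed in the same order as the output names in the argument mapping later in this commit: FP32FusedParam is output 0, through Step as output 17. Outputs 13 (ParamOut) and 15 (GradOut) follow the dtype the kernel instance was selected for, via kernel_key.dtype(). A minimal sketch of the same pattern for a hypothetical two-output kernel (op and kernel names are illustrative, not part of this PR):

PD_REGISTER_KERNEL(toy_fused_init,                  // hypothetical op name
                   CPU,
                   ALL_LAYOUT,
                   phi::fusion::ToyFusedInitKernel, // hypothetical kernel
                   float) {
  // Output 0 always stays FP32 (e.g. a master copy of the parameters).
  kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
  // Output 1 follows whatever dtype this kernel instance was chosen for.
  kernel->OutputAt(1).SetDataType(kernel_key.dtype());
}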
@@ -14,28 +14,24 @@
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace paddle {
namespace operators {
namespace details {
namespace phi {
template <typename InT, typename OutT>
struct CastFunctor {
HOSTDEVICE OutT operator()(InT x) const { return static_cast<OutT>(x); }
};
template <typename InT, typename OutT, int VecSize>
static void VecCastKernel(const phi::GPUContext &ctx,
const InT *x,
OutT *y,
size_t n) {
auto config = platform::GetGpuLaunchConfig1D(ctx, n, VecSize);
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, n, VecSize);
auto block = config.GetGridSize();
auto thread = config.GetBlockSize();
auto main_offset = n / (VecSize * thread) * VecSize * thread;
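  // Worked example (illustrative numbers): with n = 5000 elements,
  // VecSize = 4 and, say, thread = 256 threads per block, one vectorized
  // sweep covers VecSize * thread = 1024 elements, so
  // main_offset = 5000 / 1024 * 1024 = 4096; the trailing 904 elements
  // fall through to the kernel's non-vectorized remainder handling.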
@@ -50,8 +46,6 @@ static void VecCastKernel(const phi::GPUContext &ctx,
in_arr, out_arr, n, main_offset, VecSize, FunctorT());
}
} // namespace details
template <typename InT, typename OutT>
static void LaunchCastKernel(const phi::GPUContext &ctx,
const InT *x,
@@ -61,20 +55,19 @@ static void LaunchCastKernel(const phi::GPUContext &ctx,
PADDLE_ENFORCE_NE(
static_cast<const void *>(x),
static_cast<void *>(y),
platform::errors::InvalidArgument("Inplace cast is not supported yet."));
errors::InvalidArgument("Inplace cast is not supported yet."));
int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y));
switch (vec_size) {
case 4:
return details::VecCastKernel<InT, OutT, 4>(ctx, x, y, n);
return VecCastKernel<InT, OutT, 4>(ctx, x, y, n);
case 2:
return details::VecCastKernel<InT, OutT, 2>(ctx, x, y, n);
return VecCastKernel<InT, OutT, 2>(ctx, x, y, n);
case 1:
return details::VecCastKernel<InT, OutT, 1>(ctx, x, y, n);
return VecCastKernel<InT, OutT, 1>(ctx, x, y, n);
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"The vectorized size must be 1, 2 or 4."));
PADDLE_THROW(
errors::InvalidArgument("The vectorized size must be 1, 2 or 4."));
}
}
} // namespace operators
} // namespace paddle
} // namespace phi
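For context, the vec_size switch above dispatches to the widest vectorized load/store that the alignment of both pointers permits. A self-contained sketch of that selection logic, under the simplifying assumption that only pointer alignment matters (this stand-in is illustrative; it is not phi::GetVectorizedSize's actual implementation):

#include <cstdint>
#include <cstdio>

// Pick the widest vector width (in elements) whose byte span divides the
// pointer's address: try 4, then 2, else fall back to scalar access.
template <typename T>
int GetVectorizedSizeSketch(const T *ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % (4 * sizeof(T)) == 0) return 4;
  if (addr % (2 * sizeof(T)) == 0) return 2;
  return 1;
}

int main() {
  alignas(16) float buf[8];
  std::printf("%d\n", GetVectorizedSizeSketch(buf));      // 4: 16-byte aligned
  std::printf("%d\n", GetVectorizedSizeSketch(buf + 2));  // 2: 8-byte aligned
  std::printf("%d\n", GetVectorizedSizeSketch(buf + 1));  // 1: only 4-byte aligned
  return 0;
}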
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,22 +12,37 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace phi {
namespace paddle {
namespace operators {
KernelSignature DistributedFusedLambInitOpArgumentMapping(
const ArgumentMappingContext& ctx UNUSED) {
return KernelSignature(
"distributed_fused_lamb_init",
{"Param", "Grad"},
{"beta1", "beta2", "apply_weight_decay", "alignment", "rank", "nranks"},
{"FP32FusedParam",
"FP32FusedGrad",
"FP16FusedParam",
"FP16FusedGrad",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"FusedParamOffsets",
"FP32ShardFusedParamOffsets",
"FP16ShardFusedParamOffsets",
"ParamInfo",
"ParamOrder",
"ParamOut",
"MasterParamOut",
"GradOut",
"GlobalScale",
"Step"});
}
template <typename T, typename DevCtx>
class DistributedFusedLambInitOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_THROW(platform::errors::Unimplemented(
"The distributed_fused_lamb_init operator does not support CPU yet."));
}
};
} // namespace phi
} // namespace operators
} // namespace paddle
PD_REGISTER_ARG_MAPPING_FN(distributed_fused_lamb_init,
phi::DistributedFusedLambInitOpArgumentMapping);
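The mapping above pairs the fluid op's named inputs, attributes, and outputs with the positional parameters of the phi kernel declared earlier, in the order they appear after the device context. A minimal sketch of the same idiom for a hypothetical single-output op (all names illustrative, not part of this PR):

namespace phi {
KernelSignature ScaledAddOpArgumentMapping(
    const ArgumentMappingContext& ctx UNUSED) {
  // {inputs}, {attributes}, {outputs}; the order must match the kernel's
  // parameter list after the device context.
  return KernelSignature("scaled_add", {"X", "Y"}, {"scale"}, {"Out"});
}
} // namespace phi

PD_REGISTER_ARG_MAPPING_FN(scaled_add, phi::ScaledAddOpArgumentMapping);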