move dgc kernel to phi (#56003)

3c03ade8 · huangjiyi · GitHub · 6565edcc · 3c03ade8 · 3c03ade8
8 changed file
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2570,7 +2570,8 @@ Scope* OperatorWithKernel::PrepareData(
                                                 expected_kernel_key.dtype());
          }
        } else if (in_def != nullptr &&  // KernelRegisteredType is Function
-                   in_def->backend != phi::Backend::ALL_BACKEND) {
+                   in_def->backend != phi::Backend::ALL_BACKEND &&
+                   kernel_type_for_var.backend() != phi::Backend::ALL_BACKEND) {
          auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
          if ((in_def->backend != tensor_backend &&
               !(in_def->backend == phi::Backend::GPUDNN &&

--- a/paddle/fluid/operators/dgc_op.cc
+++ b/paddle/fluid/operators/dgc_op.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/dgc_op.h"
 #include <string>
 #include <vector>

--- a/paddle/fluid/operators/dgc_op.cu
+++ b/paddle/fluid/operators/dgc_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/dgc_op.h"
-namespace ops = paddle::operators;
-PD_REGISTER_STRUCT_KERNEL(dgc, GPU, ALL_LAYOUT, ops::DGCOpKernel, float) {}
--- a/paddle/fluid/operators/dgc_op.h
+++ b/paddle/fluid/operators/dgc_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <vector>
-#include "dgc/dgc.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-namespace paddle {
-namespace operators {
-inline float get_period_sparcity(const std::vector<float>& sparsity,
-                                 float cur_step,
-                                 float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step),
-                    0,
-                    platform::errors::InvalidArgument(
-                        "DGC current step=%d, but it must >= 0, "
-                        "please submit issue in github",
-                        static_cast<int>(cur_step)));
-  size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
-  if (idx >= sparsity.size()) {
-    idx = sparsity.size() - 1;
-  }
-  PADDLE_ENFORCE_LT(
-      idx,
-      sparsity.size(),
-      platform::errors::OutOfRange(
-          "sparsity index out of bounds. idx=%d >= sparsity.size=%d",
-          idx,
-          sparsity.size()));
-  return sparsity[idx];
-}
-template <typename T, typename DeviceContext>
-class DGCOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto u = ctx.Input<phi::DenseTensor>("U");
-    auto v = ctx.Input<phi::DenseTensor>("V");
-    auto g = ctx.Input<phi::DenseTensor>("Grad");
-    auto grad_out = ctx.Output<phi::DenseTensor>("Grad_out");
-    // attrs
-    float m = ctx.Attr<float>("m");
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
-    auto sparsity = ctx.Attr<std::vector<float>>("sparsity");
-    auto rampup_begin_step = ctx.Attr<float>("rampup_begin_step");
-    auto rampup_step = ctx.Attr<float>("rampup_step");
-    // nranks
-    auto nranks_tensor = ctx.Input<phi::DenseTensor>("nranks");
-    const int nranks = static_cast<int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(nranks,
-                      1,
-                      platform::errors::PreconditionNotMet(
-                          "DGC is not useful when num_trainers <= 1. Please "
-                          "use multi card or multi machine GPU"));
-    // regularization
-    auto p = ctx.Input<phi::DenseTensor>("Param");
-    float regular_coeff = ctx.Attr<float>("regular_coeff");
-    int regular_type = ctx.Attr<int>("regular_type");
-    auto p_e = framework::EigenVector<T>::Flatten(*p);
-    auto g_e = framework::EigenVector<T>::Flatten(*g);
-    auto grad_out_e = framework::EigenVector<T>::Flatten(*grad_out);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto& eigen_ctx = *dev_ctx.eigen_device();
-    // NOTE. In paddle, loss has divided by nranks. Because dgc_op is before
-    // allreduce, so local regular_coeff need div nranks too. But now we
-    // multi grad with nranks in dgc_op, in that case regular_coeff don't
-    // need to /nranks, can prevent precision loss. For coeff often equal
-    // with 1e-4, if nranks=32, coeff/nranks will be 3.125e-6, the numerical
-    // accuracy of coeff/nranks will be too low.
-    PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2,
-                      true,
-                      platform::errors::InvalidArgument(
-                          "DGC only support one of None|L1Decay|L2Decay "
-                          "Regularization for now."));
-    if (regular_type == 0) {
-      grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e;
-    } else if (regular_type == 1) {
-      // L1Decay. grad = grad + coeff * sign(param)
-      grad_out_e.device(eigen_ctx) =
-          (1.0 * nranks) * g_e + regular_coeff * p_e.sign();
-    } else if (regular_type == 2) {
-      // L2Decay. grad = grad + coeff * param
-      grad_out_e.device(eigen_ctx) = (1.0 * nranks) * g_e + regular_coeff * p_e;
-    }
-    // current step
-    auto current_step_tensor = ctx.Input<phi::DenseTensor>("current_step");
-    const float* current_step = current_step_tensor->data<float>();
-    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
-      VLOG(10) << "current_step:" << *current_step
-               << " < rampup_begin_step:" << rampup_begin_step
-               << " so does't use dgc";
-      return;
-    }
-    float ratio = 1 - get_period_sparcity(
-                          sparsity,
-                          static_cast<float>(*current_step - rampup_begin_step),
-                          rampup_step);
-    PADDLE_ENFORCE_GE(
-        ratio,
-        0.0,
-        platform::errors::InvalidArgument("DGC sparsity ratio must >= 0"));
-    PADDLE_ENFORCE_LT(
-        ratio,
-        1.0,
-        platform::errors::InvalidArgument("DGC sparsity ratio must < 1"));
-    int k = static_cast<int>(g->numel() * ratio);
-    VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
-             << ", rampup_begin_step:" << rampup_begin_step
-             << ", rampup_step:" << rampup_step
-             << ",  current_step:" << *current_step << ", ratio:" << ratio
-             << ", k:" << k << ", nranks:" << nranks;
-    auto k_out = ctx.Output<phi::DenseTensor>("k");
-    T* k_out_data = k_out->data<T>();
-    *k_out_data = k;
-    auto u_out = ctx.Output<phi::DenseTensor>("U_out");
-    auto v_out = ctx.Output<phi::DenseTensor>("V_out");
-    auto encode_grad_out = ctx.Output<phi::DenseTensor>("EncodeGrad");
-    auto gather_buff = ctx.Output<phi::DenseTensor>("GatherBuff");
-    // FIXME(gongwb): use cublas.
-    auto u_out_e = framework::EigenVector<T>::Flatten(*u_out);
-    auto u_e = framework::EigenVector<T>::Flatten(*u);
-    // calc local momentum from global momentum
-    // NOTE. If grad not multi nranks, need add below code.
-    // if (static_cast<int>(*current_step) ==
-    //     static_cast<int>(rampup_begin_step)) {
-    //   u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
-    // }
-    if (use_nesterov) {
-      // u = m * (u + g)
-      u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e);
-      // v = u + v + g
-      ElementwiseComputeEx<phi::funcs::AddFunctor<T>, DeviceContext, T>(
-          ctx, u, v, 0, phi::funcs::AddFunctor<T>(), v_out);
-      ElementwiseComputeEx<phi::funcs::AddFunctor<T>, DeviceContext, T>(
-          ctx, g, v, 0, phi::funcs::AddFunctor<T>(), v_out);
-    } else {
-      // u = m * u + g
-      u_out_e.device(eigen_ctx) = m * u_e + grad_out_e;
-      // v = u + v
-      ElementwiseComputeEx<phi::funcs::AddFunctor<T>, DeviceContext, T>(
-          ctx, u, v, 0, phi::funcs::AddFunctor<T>(), v_out);
-    }
-    T* v_out_data = v_out->mutable_data<T>(ctx.GetPlace());
-    T* u_out_data = u_out->mutable_data<T>(ctx.GetPlace());
-    T* encode_grad_out_data = encode_grad_out->mutable_data<T>(
-        framework::DDim{2 * k}, ctx.GetPlace());
-    gather_buff->mutable_data<T>(framework::DDim{2 * k * nranks},
-                                 ctx.GetPlace());
-    int buf_size = paddle::communication::dgc::get_buffer_size(k);
-    paddle::memory::allocation::AllocationPtr tmp_ious_data;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-      tmp_ious_data = memory::Alloc(
-          dev_ctx.GetPlace(),
-          buf_size,
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    }
-#endif
-    if (platform::is_cpu_place(dev_ctx.GetPlace())) {
-      tmp_ious_data = memory::Alloc(dev_ctx.GetPlace(), buf_size);
-    }
-    void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
-    if (!paddle::communication::dgc::k_select(
-            static_cast<void*>(encode_grad_out_data),
-            k,
-            v_out_data,
-            static_cast<int>(v_out->numel()),
-            buf,
-            dev_ctx.stream(),
-            u_out_data)) {
-      // TODO(weihang): owner should polish this error message
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "V_out numel error, V_out numel is %d.", v_out->numel()));
-    }
-    phi::funcs::SetConstant<DeviceContext, T> tset;
-    tset(dev_ctx, grad_out, static_cast<T>(0));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -92,6 +92,10 @@ if(WITH_XPU)
  endif()
 endif()
+if(WITH_DGC)
+  list(APPEND PHI_DEPS dgc)
+endif()
 set(PHI_SRCS
    ${common_srcs}
    ${api_srcs}

--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -44,6 +44,10 @@ if(APPLE OR WIN32)
  list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
 endif()
+if(NOT WITH_DGC)
+  list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
+endif()
 if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")

--- a/paddle/phi/kernels/dgc_kernel.h
+++ b/paddle/phi/kernels/dgc_kernel.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+template <typename T, typename Context>
+void DGCKernel(const Context& dev_ctx,
+               const DenseTensor& u,
+               const DenseTensor& v,
+               const DenseTensor& grad,
+               const DenseTensor& param,
+               const DenseTensor& current_step_tensor,
+               const DenseTensor& nranks_tensor,
+               float m,
+               bool use_nesterov,
+               const std::vector<float>& sparsity,
+               float rampup_begin_step,
+               float rampup_step,
+               float regular_coeff,
+               int regular_type,
+               DenseTensor* u_out,
+               DenseTensor* v_out,
+               DenseTensor* encode_grad_out,
+               DenseTensor* grad_out,
+               DenseTensor* k_out,
+               DenseTensor* gather_buff);
+}  // namespace phi
--- a/paddle/phi/kernels/gpu/dgc_kernel.cu
+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/dgc_kernel.h"
+#include <glog/logging.h>
+#include "dgc/dgc.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+namespace phi {
+inline float get_period_sparcity(const std::vector<float>& sparsity,
+                                 float cur_step,
+                                 float rampup_steps) {
+  PADDLE_ENFORCE_GE(
+      static_cast<int>(cur_step),
+      0,
+      phi::errors::InvalidArgument("DGC current step=%d, but it must >= 0, "
+                                   "please submit issue in github",
+                                   static_cast<int>(cur_step)));
+  size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
+  if (idx >= sparsity.size()) {
+    idx = sparsity.size() - 1;
+  }
+  PADDLE_ENFORCE_LT(
+      idx,
+      sparsity.size(),
+      phi::errors::OutOfRange(
+          "sparsity index out of bounds. idx=%d >= sparsity.size=%d",
+          idx,
+          sparsity.size()));
+  return sparsity[idx];
+}
+template <typename T, typename Context>
+void DGCKernel(const Context& dev_ctx,
+               const DenseTensor& u,
+               const DenseTensor& v,
+               const DenseTensor& grad,
+               const DenseTensor& param,
+               const DenseTensor& current_step_tensor,
+               const DenseTensor& nranks_tensor,
+               float m,
+               bool use_nesterov,
+               const std::vector<float>& sparsity,
+               float rampup_begin_step,
+               float rampup_step,
+               float regular_coeff,
+               int regular_type,
+               DenseTensor* u_out,
+               DenseTensor* v_out,
+               DenseTensor* encode_grad_out,
+               DenseTensor* grad_out,
+               DenseTensor* k_out,
+               DenseTensor* gather_buff) {
+  // nranks
+  const int nranks = static_cast<int>(*nranks_tensor.data<float>());
+  PADDLE_ENFORCE_GT(nranks,
+                    1,
+                    phi::errors::PreconditionNotMet(
+                        "DGC is not useful when num_trainers <= 1. Please "
+                        "use multi card or multi machine GPU"));
+  auto param_e = phi::EigenVector<T>::Flatten(param);
+  auto grad_e = phi::EigenVector<T>::Flatten(grad);
+  auto grad_out_e = phi::EigenVector<T>::Flatten(*grad_out);
+  auto& eigen_ctx = *dev_ctx.eigen_device();
+  // NOTE. In paddle, loss has divided by nranks. Because dgc_op is before
+  // allreduce, so local regular_coeff need div nranks too. But now we
+  // multi grad with nranks in dgc_op, in that case regular_coeff don't
+  // need to /nranks, can prevent precision loss. For coeff often equal
+  // with 1e-4, if nranks=32, coeff/nranks will be 3.125e-6, the numerical
+  // accuracy of coeff/nranks will be too low.
+  PADDLE_ENFORCE_EQ(regular_type >= 0 && regular_type <= 2,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "DGC only support one of None|L1Decay|L2Decay "
+                        "Regularization for now."));
+  if (regular_type == 0) {
+    grad_out_e.device(eigen_ctx) = (1.0 * nranks) * grad_e;
+  } else if (regular_type == 1) {
+    // L1Decay. grad = grad + coeff * sign(param)
+    grad_out_e.device(eigen_ctx) =
+        (1.0 * nranks) * grad_e + regular_coeff * param_e.sign();
+  } else if (regular_type == 2) {
+    // L2Decay. grad = grad + coeff * param
+    grad_out_e.device(eigen_ctx) =
+        (1.0 * nranks) * grad_e + regular_coeff * param_e;
+  }
+  // current step
+  const float* current_step = current_step_tensor.data<float>();
+  if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+    VLOG(10) << "current_step:" << *current_step
+             << " < rampup_begin_step:" << rampup_begin_step
+             << " so does't use dgc";
+    return;
+  }
+  float ratio = 1 - get_period_sparcity(
+                        sparsity,
+                        static_cast<float>(*current_step - rampup_begin_step),
+                        rampup_step);
+  PADDLE_ENFORCE_GE(
+      ratio, 0.0, phi::errors::InvalidArgument("DGC sparsity ratio must >= 0"));
+  PADDLE_ENFORCE_LT(
+      ratio, 1.0, phi::errors::InvalidArgument("DGC sparsity ratio must < 1"));
+  int k = static_cast<int>(grad.numel() * ratio);
+  VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
+           << ", rampup_begin_step:" << rampup_begin_step
+           << ", rampup_step:" << rampup_step
+           << ",  current_step:" << *current_step << ", ratio:" << ratio
+           << ", k:" << k << ", nranks:" << nranks;
+  T* k_out_data = k_out->data<T>();
+  *k_out_data = k;
+  // FIXME(gongwb): use cublas.
+  auto u_out_e = phi::EigenVector<T>::Flatten(*u_out);
+  auto u_e = phi::EigenVector<T>::Flatten(u);
+  // calc local momentum from global momentum
+  // NOTE. If grad not multi nranks, need add below code.
+  // if (static_cast<int>(*current_step) ==
+  //     static_cast<int>(rampup_begin_step)) {
+  //   u_out_e.device(eigen_ctx) = (1.0 / nranks) * u_e;
+  // }
+  if (use_nesterov) {
+    // u = m * (u + grad)
+    u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e);
+    // v = u + v + grad
+    dev_ctx.template Alloc<T>(v_out);
+    phi::funcs::ElementwiseCompute<phi::funcs::AddFunctor<T>, T>(
+        dev_ctx, u, v, phi::funcs::AddFunctor<T>(), v_out, 0);
+    phi::funcs::ElementwiseCompute<phi::funcs::AddFunctor<T>, T>(
+        dev_ctx, grad, v, phi::funcs::AddFunctor<T>(), v_out, 0);
+  } else {
+    // u = m * u + grad
+    u_out_e.device(eigen_ctx) = m * u_e + grad_out_e;
+    // v = u + v
+    dev_ctx.template Alloc<T>(v_out);
+    phi::funcs::ElementwiseCompute<phi::funcs::AddFunctor<T>, T>(
+        dev_ctx, u, v, phi::funcs::AddFunctor<T>(), v_out, 0);
+  }
+  T* v_out_data = dev_ctx.template Alloc<T>(v_out);
+  T* u_out_data = dev_ctx.template Alloc<T>(u_out);
+  encode_grad_out->Resize(phi::DDim{2 * k});
+  T* encode_grad_out_data = dev_ctx.template Alloc<T>(encode_grad_out);
+  gather_buff->Resize(phi::DDim{2 * k * nranks});
+  dev_ctx.template Alloc<T>(gather_buff);
+  int buf_size = paddle::communication::dgc::get_buffer_size(k);
+  phi::Allocator::AllocationPtr tmp_ious_data;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) {
+    tmp_ious_data = phi::memory_utils::Alloc(
+        dev_ctx.GetPlace(),
+        buf_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  }
+#endif
+  if (dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU) {
+    tmp_ious_data = phi::memory_utils::Alloc(dev_ctx.GetPlace(), buf_size);
+  }
+  void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
+  if (!paddle::communication::dgc::k_select(
+          static_cast<void*>(encode_grad_out_data),
+          k,
+          v_out_data,
+          static_cast<int>(v_out->numel()),
+          buf,
+          dev_ctx.stream(),
+          u_out_data)) {
+    // TODO(weihang): owner should polish this error message
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "V_out numel error, V_out numel is %d.", v_out->numel()));
+  }
+  phi::funcs::SetConstant<Context, T> tset;
+  tset(dev_ctx, grad_out, static_cast<T>(0));
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(dgc, GPU, ALL_LAYOUT, phi::DGCKernel, float) {}