Unverified · Commit b0c2ee26 authored by gouzil, committed by GitHub

[Fluid] move lars_momentum to phi (#55798)

* [Fluid] move lars_momentum to phi

* add sig

* fix optional Output

* off check_dygraph

* fix input

* fix operator[]

* fix

* try fix AllocateTmpTensor

* fix

* fix type

* Update paddle/phi/kernels/gpu/lars_momentum_kernel.cu

* fix type

* rollback

* Add Registration

* try fix win

* try fix win

* try use double

* try use operator *(float,const Derived &)

* try auto

* fix

* fix

* fix

* fix dtype

* fix type

* fix index
Parent 6839a7b9
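In short, the commit deletes the fluid OpKernel implementations (lars_momentum_op.h and the struct-kernel registrations) and re-registers the op as phi functional kernels with an argument-mapping signature. A condensed before/after view, assembled from the hunks below (CPU shown; the GPU path changes analogously):

// Before: fluid struct kernel bound to the operator class.
PD_REGISTER_STRUCT_KERNEL(
    lars_momentum, CPU, ALL_LAYOUT, ops::LarsMomentumOpKernel, float, double) {}

// After: phi functional kernel plus a KernelSignature that maps the op's
// inputs, attributes and outputs onto phi::LarsMomentumKernel.
PD_REGISTER_KERNEL(
    lars_momentum, CPU, ALL_LAYOUT, phi::LarsMomentumKernel, float, double) {}
PD_REGISTER_ARG_MAPPING_FN(lars_momentum, phi::LarsMomentumOpArgumentMapping);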
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
@@ -233,6 +234,3 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::LarsMomentumOpVarTypeInference);
PD_REGISTER_STRUCT_KERNEL(
lars_momentum, CPU, ALL_LAYOUT, ops::LarsMomentumOpKernel, float, double) {}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T, typename DeviceContext>
class LarsMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
auto velocity_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
auto param = ctx.MultiInput<phi::DenseTensor>("Param");
auto velocity = ctx.MultiInput<phi::DenseTensor>("Velocity");
auto learning_rate = ctx.MultiInput<phi::DenseTensor>("LearningRate");
auto grad = ctx.MultiInput<phi::DenseTensor>("Grad");
auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T epsilon = ctx.Attr<float>("epsilon");
T rescale_grad = ctx.Attr<float>("rescale_grad");
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
auto* lr = learning_rate[i]->data<T>();
T lars_weight_decay = weight_decay_arr[i];
param_out[i]->mutable_data<T>(ctx.GetPlace());
velocity_out[i]->mutable_data<T>(ctx.GetPlace());
auto p_out = framework::EigenVector<T>::Flatten(*(param_out[i]));
auto v_out = framework::EigenVector<T>::Flatten(*(velocity_out[i]));
auto p = framework::EigenVector<T>::Flatten(*(param[i]));
auto v = framework::EigenVector<T>::Flatten(*(velocity[i]));
auto g = framework::EigenVector<T>::Flatten(*(grad[i]));
auto rescale_g = rescale_grad * g;
phi::DenseTensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
p_norm_t.mutable_data<T>(ctx.GetPlace());
g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
ep_norm = p.square().sum().sqrt();
eg_norm = rescale_g.square().sum().sqrt();
T local_lr = lr[0];
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
}
v_out = v * mu + local_lr * (rescale_g + lars_weight_decay * p);
p_out = p - v_out;
}
}
};
} // namespace operators
} // namespace paddle
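For reference, the removed fluid kernel above and the new phi kernels below implement the same per-tensor LARS update. Writing rescale_grad as r, lars_weight_decay as \beta, mu as \mu, lars_coeff as c and epsilon as \epsilon:

\eta_{local} = lr \cdot c \cdot \frac{\lVert p \rVert_2}{\lVert r g \rVert_2 + \beta \lVert p \rVert_2 + \epsilon}

(the trust ratio is used only when \beta > 0 and both norms are positive; otherwise \eta_{local} = lr), followed by

v_{out} = \mu v + \eta_{local} (r g + \beta p), \qquad p_{out} = p - v_{out}.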
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace paddle {
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/lars_momentum_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void LarsMomentumKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& param,
const std::vector<const DenseTensor*>& velocity,
const std::vector<const DenseTensor*>& learning_rate,
const std::vector<const DenseTensor*>& grad,
const paddle::optional<std::vector<const DenseTensor*>>& master_param,
const std::vector<float>& weight_decay_arr,
float mu,
float lars_coeff,
float epsilon,
bool multi_precision,
float rescale_grad,
std::vector<DenseTensor*> param_out,
std::vector<DenseTensor*> velocity_out,
std::vector<DenseTensor*> master_param_out) {
int op_num = param.size();
T mu_ = static_cast<T>(mu);
for (int i = 0; i < op_num; ++i) {
auto* lr = learning_rate[i]->data<T>();
T lars_weight_decay = weight_decay_arr[i];
dev_ctx.template Alloc<T>(param_out[i]);
dev_ctx.template Alloc<T>(velocity_out[i]);
auto p_out = phi::EigenVector<T>::Flatten(*(param_out[i]));
auto v_out = phi::EigenVector<T>::Flatten(*(velocity_out[i]));
auto p = phi::EigenVector<T>::Flatten(*(param[i]));
auto v = phi::EigenVector<T>::Flatten(*(velocity[i]));
Eigen::TensorMap<Eigen::Tensor<const T, 1, 1>> g =
phi::EigenVector<T>::Flatten(*(grad[i]));
auto rescale_g = static_cast<T>(rescale_grad) * g;
phi::DenseTensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
dev_ctx.template Alloc<T>(&p_norm_t);
dev_ctx.template Alloc<T>(&g_norm_t);
auto ep_norm = phi::EigenScalar<T>::From(p_norm_t);
auto eg_norm = phi::EigenScalar<T>::From(g_norm_t);
ep_norm = p.square().sum().sqrt();
eg_norm = rescale_g.square().sum().sqrt();
T local_lr = lr[0];
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
}
v_out = v * mu_ + local_lr * (rescale_g + lars_weight_decay * p);
p_out = p - v_out;
}
}
} // namespace phi
PD_REGISTER_KERNEL(
lars_momentum, CPU, ALL_LAYOUT, phi::LarsMomentumKernel, float, double) {}
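As a sanity sketch of what the phi CPU kernel registered above computes for each tensor, here is a minimal framework-independent version in plain C++ (illustrative only: the function name, std::vector signature, and float-only types are assumptions, not Paddle APIs):

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical standalone helper mirroring the per-tensor LARS step above.
void LarsStep(std::vector<float>& param,
              std::vector<float>& velocity,
              const std::vector<float>& grad,
              float lr,
              float mu,
              float lars_coeff,
              float lars_weight_decay,
              float epsilon,
              float rescale_grad) {
  // L2 norm of the parameter and of the rescaled gradient.
  double p_norm = 0.0, g_norm = 0.0;
  for (std::size_t i = 0; i < param.size(); ++i) {
    p_norm += static_cast<double>(param[i]) * param[i];
    const double g = static_cast<double>(rescale_grad) * grad[i];
    g_norm += g * g;
  }
  p_norm = std::sqrt(p_norm);
  g_norm = std::sqrt(g_norm);

  // Trust-ratio local learning rate; falls back to lr when any norm is zero.
  float local_lr = lr;
  if (lars_weight_decay > 0 && p_norm > 0 && g_norm > 0) {
    local_lr = static_cast<float>(
        lr * lars_coeff * p_norm /
        (g_norm + lars_weight_decay * p_norm + epsilon));
  }

  // Momentum update followed by the parameter step.
  for (std::size_t i = 0; i < param.size(); ++i) {
    const float g = rescale_grad * grad[i];
    velocity[i] =
        mu * velocity[i] + local_lr * (g + lars_weight_decay * param[i]);
    param[i] -= velocity[i];
  }
}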
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/fluid/framework/op_registry.h"
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/lars_momentum_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_cuda_utils.h"
#include "paddle/utils/optional.h"
#if CUDA_VERSION >= 11000
#include <cooperative_groups.h>
@@ -30,8 +33,7 @@ limitations under the License. */
#define LARS_MAX_MERGED_OPS 60
namespace paddle {
namespace operators {
namespace phi {
template <typename T>
using MultiPrecisionType = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -253,7 +255,7 @@ __forceinline__ __device__ void MomentumUpdate(
master_param_out);
} else {
if (std::is_same<T, float>::value ||
std::is_same<T, paddle::platform::float16>::value) {
std::is_same<T, dtype::float16>::value) {
/* TODO(limingshu): pointer cast may damage memory accessing for fp16 */
VectorizeLarsUpdate<T, MT, /*VecSize=*/4, /*IsAmp=*/false>(
grad,
@@ -419,7 +421,7 @@ __global__ void MomentumLarsKernel(const T* param,
}
template <typename T, typename MT>
inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
inline void SeparatedLarsMomentumOpCUDAKernel(const GPUContext& cuda_ctx,
const T* param_data,
T* param_out_data,
const MT* velocity_data,
@@ -474,216 +476,213 @@ inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
is_amp);
}
template <typename T, typename DeviceContext>
class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
template <typename T, typename Context>
void LarsMomentumKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& param,
const std::vector<const DenseTensor*>& velocity,
const std::vector<const DenseTensor*>& learning_rate,
const std::vector<const DenseTensor*>& grad,
const paddle::optional<std::vector<const DenseTensor*>>& master_param,
const std::vector<float>& weight_decay_arr,
float mu,
float lars_coeff,
float epsilon,
bool multi_precision,
float rescale_grad,
std::vector<DenseTensor*> param_out,
std::vector<DenseTensor*> velocity_out,
std::vector<DenseTensor*> master_param_out) {
using MT = MultiPrecisionType<T>;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int num_blocks_per_sm = 0;
bool multi_precision = ctx.Attr<bool>("multi_precision");
auto& cuda_ctx = ctx.template device_context<phi::GPUContext>();
int sm_num = cuda_ctx.GetSMCount();
phi::DenseTensor tmp_buffer_t = ctx.AllocateTmpTensor<MT, phi::GPUContext>(
{LARS_BLOCK_SIZE << 1}, cuda_ctx);
auto* p_buffer = tmp_buffer_t.mutable_data<MT>(ctx.GetPlace());
auto* g_buffer = p_buffer + LARS_BLOCK_SIZE;
MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
MT rescale_grad = static_cast<MT>(ctx.Attr<float>("rescale_grad"));
auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
auto grad = ctx.MultiInput<phi::DenseTensor>("Grad");
auto param = ctx.MultiInput<phi::DenseTensor>("Param");
auto velocity = ctx.MultiInput<phi::DenseTensor>("Velocity");
auto param_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
auto velocity_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
auto learning_rate = ctx.MultiInput<phi::DenseTensor>("LearningRate");
auto master_param = ctx.MultiInput<phi::DenseTensor>("MasterParam");
auto master_param_out = ctx.MultiOutput<phi::DenseTensor>("MasterParamOut");
int op_num = grad.size();
int num_blocks_per_sm = 0;
int sm_num = dev_ctx.GetSMCount();
// phi::DenseTensor tmp_buffer_t = ctx.AllocateTmpTensor<MT, phi::GPUContext>(
// {LARS_BLOCK_SIZE << 1}, cuda_ctx);
phi::DenseTensor tmp_buffer_t;
tmp_buffer_t.Resize({LARS_BLOCK_SIZE << 1});
MT* p_buffer = dev_ctx.template Alloc<MT>(&tmp_buffer_t);
MT* g_buffer = p_buffer + LARS_BLOCK_SIZE;
MT mu_ = static_cast<MT>(mu);
MT lars_coeff_ = static_cast<MT>(lars_coeff);
MT epsilon_ = static_cast<MT>(epsilon);
MT rescale_grad_ = static_cast<MT>(rescale_grad);
int op_num = grad.size();
#if CUDA_VERSION >= 11000
if (op_num > 1) {
LarsParamWarpper<T, MT> lars_warpper;
PADDLE_ENFORCE_LT(
op_num,
LARS_MAX_MERGED_OPS,
platform::errors::InvalidArgument(
"The maximum number of merged-ops supported is (%d), but"
"lars op required for trainning this model is (%d)\n",
LARS_MAX_MERGED_OPS,
op_num));
/* The implementation of the lars optimizer consists of two steps:
1. Compute the L2 norm of the grad data and of the param data.
2. Update param and velocity using those L2 norm results.
Step 1 and step 2 can be merged via the API provided by NVIDIA,
cudaLaunchCooperativeKernel:
- The thread quantity shall be less than the physical SM thread limit.
- The launched thread blocks can execute synchronously. */
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_per_sm,
MergedMomentumLarsKernel<T, MT>,
LARS_BLOCK_SIZE,
sizeof(MT) << 1);
size_t total_numel = 0;
for (int i = 0; i < op_num; ++i) {
size_t temp_numel = param[i]->numel();
total_numel += temp_numel;
lars_warpper.numel_arr[i] = temp_numel;
lars_warpper.g_arr[i] = grad[i]->data<T>();
lars_warpper.lr_arr[i] = learning_rate[i]->data<MT>();
lars_warpper.p_out_arr[i] =
param_out[i]->mutable_data<T>(ctx.GetPlace());
lars_warpper.v_out_arr[i] =
velocity_out[i]->mutable_data<MT>(ctx.GetPlace());
lars_warpper.weight_decay_arr[i] = static_cast<MT>(weight_decay_arr[i]);
PADDLE_ENFORCE_EQ(
param[i]->data<T>(),
lars_warpper.p_out_arr[i],
platform::errors::InvalidArgument(
"Input(Param) and Output(ParamOut) must be the same Tensors."));
PADDLE_ENFORCE_EQ(velocity[i]->data<MT>(),
lars_warpper.v_out_arr[i],
platform::errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
}
int64_t avg_numel = total_numel / op_num;
LarsThreadConfig<float> lars_thread_config(
avg_numel, sm_num, num_blocks_per_sm);
if (op_num > 1) {
LarsParamWarpper<T, MT> lars_warpper;
PADDLE_ENFORCE_LT(
op_num,
LARS_MAX_MERGED_OPS,
errors::InvalidArgument(
"The maximum number of merged-ops supported is (%d), but"
"lars op required for trainning this model is (%d)\n",
LARS_MAX_MERGED_OPS,
op_num));
/* The implementation of the lars optimizer consists of two steps:
1. Compute the L2 norm of the grad data and of the param data.
2. Update param and velocity using those L2 norm results.
Step 1 and step 2 can be merged via the API provided by NVIDIA,
cudaLaunchCooperativeKernel:
- The thread quantity shall be less than the physical SM thread limit.
- The launched thread blocks can execute synchronously. */
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_per_sm,
MergedMomentumLarsKernel<T, MT>,
LARS_BLOCK_SIZE,
sizeof(MT) << 1);
size_t total_numel = 0;
for (int i = 0; i < op_num; ++i) {
size_t temp_numel = param[i]->numel();
total_numel += temp_numel;
lars_warpper.numel_arr[i] = temp_numel;
lars_warpper.g_arr[i] = grad[i]->data<T>();
lars_warpper.lr_arr[i] = learning_rate[i]->data<MT>();
lars_warpper.p_out_arr[i] = dev_ctx.template Alloc<T>(param_out[i]);
lars_warpper.v_out_arr[i] = dev_ctx.template Alloc<MT>(velocity_out[i]);
lars_warpper.weight_decay_arr[i] = static_cast<MT>(weight_decay_arr[i]);
PADDLE_ENFORCE_EQ(
param[i]->data<T>(),
lars_warpper.p_out_arr[i],
errors::InvalidArgument(
"Input(Param) and Output(ParamOut) must be the same Tensors."));
PADDLE_ENFORCE_EQ(velocity[i]->data<MT>(),
lars_warpper.v_out_arr[i],
errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
}
int64_t avg_numel = total_numel / op_num;
LarsThreadConfig<float> lars_thread_config(
avg_numel, sm_num, num_blocks_per_sm);
for (int i = 0; i < op_num; ++i) {
lars_warpper.repeat_arr[i] =
lars_thread_config.GetRepeatTimes(lars_warpper.numel_arr[i]);
}
if (multi_precision) {
for (int i = 0; i < op_num; ++i) {
lars_warpper.repeat_arr[i] =
lars_thread_config.GetRepeatTimes(lars_warpper.numel_arr[i]);
}
if (multi_precision) {
for (int i = 0; i < op_num; ++i) {
lars_warpper.master_p_out_arr[i] =
master_param_out[i]->mutable_data<MT>(ctx.GetPlace());
PADDLE_ENFORCE_EQ(master_param[i]->data<MT>(),
lars_warpper.master_p_out_arr[i],
platform::errors::InvalidArgument(
"Input(MasterParam) and Output(MasterParamOut) "
"must be the same Tensors."));
}
lars_warpper.master_p_out_arr[i] =
dev_ctx.template Alloc<MT>(master_param_out[i]);
PADDLE_ENFORCE_EQ(master_param.get()[i]->data<MT>(),
lars_warpper.master_p_out_arr[i],
errors::InvalidArgument(
"Input(MasterParam) and Output(MasterParamOut) "
"must be the same Tensors."));
}
void* cuda_param[] = {reinterpret_cast<void*>(&lars_warpper),
reinterpret_cast<void*>(&p_buffer),
reinterpret_cast<void*>(&g_buffer),
reinterpret_cast<void*>(&op_num),
reinterpret_cast<void*>(&mu),
reinterpret_cast<void*>(&lars_coeff),
reinterpret_cast<void*>(&epsilon),
reinterpret_cast<void*>(&rescale_grad),
reinterpret_cast<void*>(&multi_precision)};
// Launch all SM threads; the threads of each block cooperate synchronously.
cudaLaunchCooperativeKernel(
reinterpret_cast<void*>(MergedMomentumLarsKernel<T, MT>),
lars_thread_config.grid_for_lars,
LARS_BLOCK_SIZE,
cuda_param,
0,
cuda_ctx.stream());
} else {
auto* param_data = param[0]->data<T>();
auto* grad_data = grad[0]->data<T>();
auto* velocity_data = velocity[0]->data<MT>();
auto* lr = learning_rate[0]->data<MT>();
auto* param_out_data = param_out[0]->mutable_data<T>(ctx.GetPlace());
auto* velocity_out_data =
velocity_out[0]->mutable_data<MT>(ctx.GetPlace());
const MT* master_param_data =
multi_precision ? master_param[0]->data<MT>() : nullptr;
MT* master_param_out_data =
multi_precision
? master_param_out[0]->mutable_data<MT>(ctx.GetPlace())
: nullptr;
int64_t numel = param[0]->numel();
MT lars_weight_decay = weight_decay_arr[0];
// Figure out how many blocks can be active in each sm.
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
MomentumLarsKernel<T, MT>,
LARS_BLOCK_SIZE,
sizeof(MT) << 1);
LarsThreadConfig<float> lars_thread_config(
numel, sm_num, num_blocks_per_sm);
int repeat_times = lars_thread_config.GetRepeatTimes(numel);
int thresh = 0;
void* cuda_param[] = {
reinterpret_cast<void*>(&param_data),
reinterpret_cast<void*>(&grad_data),
reinterpret_cast<void*>(&velocity_data),
reinterpret_cast<void*>(&param_out_data),
reinterpret_cast<void*>(&velocity_out_data),
reinterpret_cast<void*>(&master_param_data),
reinterpret_cast<void*>(&master_param_out_data),
reinterpret_cast<void*>(&lr),
reinterpret_cast<void*>(&p_buffer),
reinterpret_cast<void*>(&g_buffer),
reinterpret_cast<void*>(&mu),
reinterpret_cast<void*>(&lars_coeff),
reinterpret_cast<void*>(&lars_weight_decay),
reinterpret_cast<void*>(&epsilon),
reinterpret_cast<void*>(&rescale_grad),
reinterpret_cast<void*>(&repeat_times),
reinterpret_cast<void*>(&thresh), // Just a placeholder
reinterpret_cast<void*>(&numel),
reinterpret_cast<void*>(&multi_precision)};
// Launch all SM threads.
cudaLaunchCooperativeKernel(
reinterpret_cast<void*>(MomentumLarsKernel<T, MT>),
lars_thread_config.grid_for_lars,
LARS_BLOCK_SIZE,
cuda_param,
0,
cuda_ctx.stream());
}
void* cuda_param[] = {reinterpret_cast<void*>(&lars_warpper),
reinterpret_cast<void*>(&p_buffer),
reinterpret_cast<void*>(&g_buffer),
reinterpret_cast<void*>(&op_num),
reinterpret_cast<void*>(&mu_),
reinterpret_cast<void*>(&lars_coeff_),
reinterpret_cast<void*>(&epsilon_),
reinterpret_cast<void*>(&rescale_grad_),
reinterpret_cast<void*>(&multi_precision)};
// Launch all SM threads; the threads of each block cooperate synchronously.
cudaLaunchCooperativeKernel(
reinterpret_cast<void*>(MergedMomentumLarsKernel<T, MT>),
lars_thread_config.grid_for_lars,
LARS_BLOCK_SIZE,
cuda_param,
0,
dev_ctx.stream());
} else {
auto* param_data = param[0]->data<T>();
auto* grad_data = grad[0]->data<T>();
auto* velocity_data = velocity[0]->data<MT>();
auto* lr = learning_rate[0]->data<MT>();
auto* param_out_data = dev_ctx.template Alloc<T>(param_out[0]);
auto* velocity_out_data = dev_ctx.template Alloc<MT>(velocity_out[0]);
const MT* master_param_data =
multi_precision ? master_param.get()[0]->data<MT>() : nullptr;
MT* master_param_out_data =
multi_precision ? dev_ctx.template Alloc<MT>(master_param_out[0])
: nullptr;
int64_t numel = param[0]->numel();
MT lars_weight_decay = weight_decay_arr[0];
// Figure out how many blocks can be active in each sm.
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
MomentumLarsKernel<T, MT>,
LARS_BLOCK_SIZE,
sizeof(MT) << 1);
LarsThreadConfig<float> lars_thread_config(
numel, sm_num, num_blocks_per_sm);
int repeat_times = lars_thread_config.GetRepeatTimes(numel);
int thresh = 0;
void* cuda_param[] = {
reinterpret_cast<void*>(&param_data),
reinterpret_cast<void*>(&grad_data),
reinterpret_cast<void*>(&velocity_data),
reinterpret_cast<void*>(&param_out_data),
reinterpret_cast<void*>(&velocity_out_data),
reinterpret_cast<void*>(&master_param_data),
reinterpret_cast<void*>(&master_param_out_data),
reinterpret_cast<void*>(&lr),
reinterpret_cast<void*>(&p_buffer),
reinterpret_cast<void*>(&g_buffer),
reinterpret_cast<void*>(&mu_),
reinterpret_cast<void*>(&lars_coeff_),
reinterpret_cast<void*>(&lars_weight_decay),
reinterpret_cast<void*>(&epsilon_),
reinterpret_cast<void*>(&rescale_grad_),
reinterpret_cast<void*>(&repeat_times),
reinterpret_cast<void*>(&thresh), // Just a placeholder
reinterpret_cast<void*>(&numel),
reinterpret_cast<void*>(&multi_precision)};
// Launch all SM threads.
cudaLaunchCooperativeKernel(
reinterpret_cast<void*>(MomentumLarsKernel<T, MT>),
lars_thread_config.grid_for_lars,
LARS_BLOCK_SIZE,
cuda_param,
0,
dev_ctx.stream());
}
#else
for (int i = 0; i < op_num; ++i) {
const MT* master_param_data =
multi_precision ? master_param[i]->data<MT>() : nullptr;
MT* master_param_out_data =
multi_precision
? master_param_out[i]->mutable_data<MT>(ctx.GetPlace())
: nullptr;
SeparatedLarsMomentumOpCUDAKernel<T, MT>(
cuda_ctx,
param[i]->data<T>(),
param_out[i]->mutable_data<T>(ctx.GetPlace()),
velocity[i]->data<MT>(),
velocity_out[i]->mutable_data<MT>(ctx.GetPlace()),
grad[i]->data<T>(),
learning_rate[i]->data<MT>(),
p_buffer,
g_buffer,
mu,
lars_coeff,
weight_decay_arr[i],
epsilon,
rescale_grad,
param[i]->numel(),
master_param_data,
master_param_out_data,
multi_precision);
}
for (int i = 0; i < op_num; ++i) {
const MT* master_param_data =
multi_precision ? master_param.get()[i]->data<MT>() : nullptr;
MT* master_param_out_data =
multi_precision ? dev_ctx.template Alloc<MT>(master_param_out[i])
: nullptr;
SeparatedLarsMomentumOpCUDAKernel<T, MT>(
dev_ctx,
param[i]->data<T>(),
dev_ctx.template Alloc<T>(param_out[i]),
velocity[i]->data<MT>(),
dev_ctx.template Alloc<MT>(velocity_out[i]),
grad[i]->data<T>(),
learning_rate[i]->data<MT>(),
p_buffer,
g_buffer,
mu_,
lars_coeff_,
weight_decay_arr[i],
epsilon_,
rescale_grad_,
param[i]->numel(),
master_param_data,
master_param_out_data,
multi_precision);
}
#endif
}
} // namespace phi
PD_REGISTER_KERNEL(lars_momentum,
GPU,
ALL_LAYOUT,
phi::LarsMomentumKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
PD_REGISTER_STRUCT_KERNEL(lars_momentum,
GPU,
ALL_LAYOUT,
ops::LarsMomentumOpCUDAKernel,
float,
double,
plat::float16) {}
}
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void LarsMomentumKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& param,
const std::vector<const DenseTensor*>& velocity,
const std::vector<const DenseTensor*>& learning_rate,
const std::vector<const DenseTensor*>& grad,
const paddle::optional<std::vector<const DenseTensor*>>& master_param,
const std::vector<float>& weight_decay_arr,
float mu,
float lars_coeff,
float epsilon,
bool multi_precision,
float rescale_grad,
std::vector<DenseTensor*> param_out,
std::vector<DenseTensor*> velocity_out,
std::vector<DenseTensor*> master_param_out);
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature LarsMomentumOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"lars_momentum",
{"Param", "Velocity", "LearningRate", "Grad", "MasterParam"},
{"lars_weight_decay",
"mu",
"lars_coeff",
"epsilon",
"multi_precision",
"rescale_grad"},
{"ParamOut", "VelocityOut", "MasterParamOut"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(lars_momentum, phi::LarsMomentumOpArgumentMapping);
@@ -312,7 +312,7 @@ class TestLarsMomentumOp(OpTest):
def test_check_output(self):
paddle.enable_static()
self.check_output()
self.check_output(check_dygraph=False)
def config(self):
self.params_num = 1
......