[Fluid] move lars_momentum to phi (#55798)

* [Fluid] move lars_momentum to phi * add sig * fix optional Output * off check_dygraph * fix input * fix operator[] * fix * try fix AllocateTmpTensor * fix * fix type * Update paddle/phi/kernels/gpu/lars_momentum_kernel.cu * fix type * rollback * Add Registration * try fix win * try fix win * try use double * try use operator *(float,const Derived &) * try auto * fix * fix * fix * fix dtype * fix type * fix index

[Fluid] move lars_momentum to phi (#55798)
* [Fluid] move lars_momentum to phi * add sig * fix optional Output * off check_dygraph * fix input * fix operator[] * fix * try fix AllocateTmpTensor * fix * fix type * Update paddle/phi/kernels/gpu/lars_momentum_kernel.cu * fix type * rollback * Add Registration * try fix win * try fix win * try use double * try use operator *(float,const Derived &) * try auto * fix * fix * fix * fix dtype * fix type * fix index
b0c2ee26 · gouzil · GitHub · 6839a7b9 · b0c2ee26 · 6839a7b9
8 changed file
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
 namespace operators {
@@ -233,6 +234,3 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::LarsMomentumOpVarTypeInference);
-
-PD_REGISTER_STRUCT_KERNEL(
-    lars_momentum, CPU, ALL_LAYOUT, ops::LarsMomentumOpKernel, float, double) {}
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename DeviceContext>
-class LarsMomentumOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
-    auto velocity_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
-    auto param = ctx.MultiInput<phi::DenseTensor>("Param");
-    auto velocity = ctx.MultiInput<phi::DenseTensor>("Velocity");
-    auto learning_rate = ctx.MultiInput<phi::DenseTensor>("LearningRate");
-    auto grad = ctx.MultiInput<phi::DenseTensor>("Grad");
-    auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    T lars_coeff = ctx.Attr<float>("lars_coeff");
-    T epsilon = ctx.Attr<float>("epsilon");
-    T rescale_grad = ctx.Attr<float>("rescale_grad");
-
-    int op_num = param.size();
-    for (int i = 0; i < op_num; ++i) {
-      auto* lr = learning_rate[i]->data<T>();
-      T lars_weight_decay = weight_decay_arr[i];
-      param_out[i]->mutable_data<T>(ctx.GetPlace());
-      velocity_out[i]->mutable_data<T>(ctx.GetPlace());
-
-      auto p_out = framework::EigenVector<T>::Flatten(*(param_out[i]));
-      auto v_out = framework::EigenVector<T>::Flatten(*(velocity_out[i]));
-      auto p = framework::EigenVector<T>::Flatten(*(param[i]));
-      auto v = framework::EigenVector<T>::Flatten(*(velocity[i]));
-      auto g = framework::EigenVector<T>::Flatten(*(grad[i]));
-      auto rescale_g = rescale_grad * g;
-
-      phi::DenseTensor p_norm_t, g_norm_t;
-      p_norm_t.Resize({1});
-      g_norm_t.Resize({1});
-      p_norm_t.mutable_data<T>(ctx.GetPlace());
-      g_norm_t.mutable_data<T>(ctx.GetPlace());
-      auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
-      auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
-      ep_norm = p.square().sum().sqrt();
-      eg_norm = rescale_g.square().sum().sqrt();
-
-      T local_lr = lr[0];
-      if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
-        local_lr = lr[0] * lars_coeff * ep_norm(0) /
-                   (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
-      }
-      v_out = v * mu + local_lr * (rescale_g + lars_weight_decay * p);
-      p_out = p - v_out;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"

 namespace paddle {

--- a/paddle/phi/kernels/cpu/lars_momentum_kernel.cc
+++ b/paddle/phi/kernels/cpu/lars_momentum_kernel.cc
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lars_momentum_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LarsMomentumKernel(
+    const Context& dev_ctx,
+    const std::vector<const DenseTensor*>& param,
+    const std::vector<const DenseTensor*>& velocity,
+    const std::vector<const DenseTensor*>& learning_rate,
+    const std::vector<const DenseTensor*>& grad,
+    const paddle::optional<std::vector<const DenseTensor*>>& master_param,
+    const std::vector<float>& weight_decay_arr,
+    float mu,
+    float lars_coeff,
+    float epsilon,
+    bool multi_precision,
+    float rescale_grad,
+    std::vector<DenseTensor*> param_out,
+    std::vector<DenseTensor*> velocity_out,
+    std::vector<DenseTensor*> master_param_out) {
+  int op_num = param.size();
+  T mu_ = static_cast<T>(mu);
+  for (int i = 0; i < op_num; ++i) {
+    auto* lr = learning_rate[i]->data<T>();
+    T lars_weight_decay = weight_decay_arr[i];
+    dev_ctx.template Alloc<T>(param_out[i]);
+    dev_ctx.template Alloc<T>(velocity_out[i]);
+
+    auto p_out = phi::EigenVector<T>::Flatten(*(param_out[i]));
+    auto v_out = phi::EigenVector<T>::Flatten(*(velocity_out[i]));
+    auto p = phi::EigenVector<T>::Flatten(*(param[i]));
+    auto v = phi::EigenVector<T>::Flatten(*(velocity[i]));
+    Eigen::TensorMap<Eigen::Tensor<const T, 1, 1>> g =
+        phi::EigenVector<T>::Flatten(*(grad[i]));
+    auto rescale_g = static_cast<T>(rescale_grad) * g;
+
+    phi::DenseTensor p_norm_t, g_norm_t;
+    p_norm_t.Resize({1});
+    g_norm_t.Resize({1});
+    dev_ctx.template Alloc<T>(&p_norm_t);
+    dev_ctx.template Alloc<T>(&g_norm_t);
+    auto ep_norm = phi::EigenScalar<T>::From(p_norm_t);
+    auto eg_norm = phi::EigenScalar<T>::From(g_norm_t);
+    ep_norm = p.square().sum().sqrt();
+    eg_norm = rescale_g.square().sum().sqrt();
+
+    T local_lr = lr[0];
+    if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
+      local_lr = lr[0] * lars_coeff * ep_norm(0) /
+                 (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
+    }
+    v_out = v * mu_ + local_lr * (rescale_g + lars_weight_decay * p);
+    p_out = p - v_out;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    lars_momentum, CPU, ALL_LAYOUT, phi::LarsMomentumKernel, float, double) {}
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
-#include "paddle/fluid/framework/op_registry.h"
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lars_momentum_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
+#include "paddle/utils/optional.h"

 #if CUDA_VERSION >= 11000
 #include <cooperative_groups.h>
@@ -30,8 +33,7 @@ limitations under the License. */

 #define LARS_MAX_MERGED_OPS 60

-namespace paddle {
-namespace operators {
+namespace phi {

 template <typename T>
 using MultiPrecisionType = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -253,7 +255,7 @@ __forceinline__ __device__ void MomentumUpdate(
                                                              master_param_out);
  } else {
    if (std::is_same<T, float>::value ||
-        std::is_same<T, paddle::platform::float16>::value) {
+        std::is_same<T, dtype::float16>::value) {
      /* TODO(limingshu): pointer cast may damage memory accessing for fp16 */
      VectorizeLarsUpdate<T, MT, /*VecSize=*/4, /*IsAmp=*/false>(
          grad,
@@ -419,7 +421,7 @@ __global__ void MomentumLarsKernel(const T* param,
 }

 template <typename T, typename MT>
-inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
+inline void SeparatedLarsMomentumOpCUDAKernel(const GPUContext& cuda_ctx,
                                              const T* param_data,
                                              T* param_out_data,
                                              const MT* velocity_data,
@@ -474,35 +476,37 @@ inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
                              is_amp);
 }

-template <typename T, typename DeviceContext>
-class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+template <typename T, typename Context>
+void LarsMomentumKernel(
+    const Context& dev_ctx,
+    const std::vector<const DenseTensor*>& param,
+    const std::vector<const DenseTensor*>& velocity,
+    const std::vector<const DenseTensor*>& learning_rate,
+    const std::vector<const DenseTensor*>& grad,
+    const paddle::optional<std::vector<const DenseTensor*>>& master_param,
+    const std::vector<float>& weight_decay_arr,
+    float mu,
+    float lars_coeff,
+    float epsilon,
+    bool multi_precision,
+    float rescale_grad,
+    std::vector<DenseTensor*> param_out,
+    std::vector<DenseTensor*> velocity_out,
+    std::vector<DenseTensor*> master_param_out) {
  using MT = MultiPrecisionType<T>;
-
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
  int num_blocks_per_sm = 0;
-    bool multi_precision = ctx.Attr<bool>("multi_precision");
-    auto& cuda_ctx = ctx.template device_context<phi::GPUContext>();
-    int sm_num = cuda_ctx.GetSMCount();
-    phi::DenseTensor tmp_buffer_t = ctx.AllocateTmpTensor<MT, phi::GPUContext>(
-        {LARS_BLOCK_SIZE << 1}, cuda_ctx);
-    auto* p_buffer = tmp_buffer_t.mutable_data<MT>(ctx.GetPlace());
-    auto* g_buffer = p_buffer + LARS_BLOCK_SIZE;
-
-    MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
-    MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
-    MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
-    MT rescale_grad = static_cast<MT>(ctx.Attr<float>("rescale_grad"));
-
-    auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
-    auto grad = ctx.MultiInput<phi::DenseTensor>("Grad");
-    auto param = ctx.MultiInput<phi::DenseTensor>("Param");
-    auto velocity = ctx.MultiInput<phi::DenseTensor>("Velocity");
-    auto param_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
-    auto velocity_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
-    auto learning_rate = ctx.MultiInput<phi::DenseTensor>("LearningRate");
-    auto master_param = ctx.MultiInput<phi::DenseTensor>("MasterParam");
-    auto master_param_out = ctx.MultiOutput<phi::DenseTensor>("MasterParamOut");
+  int sm_num = dev_ctx.GetSMCount();
+  // phi::DenseTensor tmp_buffer_t = ctx.AllocateTmpTensor<MT, phi::GPUContext>(
+  //     {LARS_BLOCK_SIZE << 1}, cuda_ctx);
+  phi::DenseTensor tmp_buffer_t;
+  tmp_buffer_t.Resize({LARS_BLOCK_SIZE << 1});
+  MT* p_buffer = dev_ctx.template Alloc<MT>(&tmp_buffer_t);
+  MT* g_buffer = p_buffer + LARS_BLOCK_SIZE;
+
+  MT mu_ = static_cast<MT>(mu);
+  MT lars_coeff_ = static_cast<MT>(lars_coeff);
+  MT epsilon_ = static_cast<MT>(epsilon);
+  MT rescale_grad_ = static_cast<MT>(rescale_grad);

  int op_num = grad.size();
 #if CUDA_VERSION >= 11000
@@ -511,7 +515,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_LT(
        op_num,
        LARS_MAX_MERGED_OPS,
-          platform::errors::InvalidArgument(
+        errors::InvalidArgument(
            "The maximum number of merged-ops supported is (%d), but"
            "lars op required for trainning this model is (%d)\n",
            LARS_MAX_MERGED_OPS,
@@ -537,19 +541,17 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
      lars_warpper.numel_arr[i] = temp_numel;
      lars_warpper.g_arr[i] = grad[i]->data<T>();
      lars_warpper.lr_arr[i] = learning_rate[i]->data<MT>();
-        lars_warpper.p_out_arr[i] =
-            param_out[i]->mutable_data<T>(ctx.GetPlace());
-        lars_warpper.v_out_arr[i] =
-            velocity_out[i]->mutable_data<MT>(ctx.GetPlace());
+      lars_warpper.p_out_arr[i] = dev_ctx.template Alloc<T>(param_out[i]);
+      lars_warpper.v_out_arr[i] = dev_ctx.template Alloc<MT>(velocity_out[i]);
      lars_warpper.weight_decay_arr[i] = static_cast<MT>(weight_decay_arr[i]);
      PADDLE_ENFORCE_EQ(
          param[i]->data<T>(),
          lars_warpper.p_out_arr[i],
-            platform::errors::InvalidArgument(
+          errors::InvalidArgument(
              "Input(Param) and Output(ParamOut) must be the same Tensors."));
      PADDLE_ENFORCE_EQ(velocity[i]->data<MT>(),
                        lars_warpper.v_out_arr[i],
-                          platform::errors::InvalidArgument(
+                        errors::InvalidArgument(
                            "Input(Velocity) and Output(VelocityOut) must be "
                            "the same Tensors."));
    }
@@ -563,10 +565,10 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
    if (multi_precision) {
      for (int i = 0; i < op_num; ++i) {
        lars_warpper.master_p_out_arr[i] =
-              master_param_out[i]->mutable_data<MT>(ctx.GetPlace());
-          PADDLE_ENFORCE_EQ(master_param[i]->data<MT>(),
+            dev_ctx.template Alloc<MT>(master_param_out[i]);
+        PADDLE_ENFORCE_EQ(master_param.get()[i]->data<MT>(),
                          lars_warpper.master_p_out_arr[i],
-                            platform::errors::InvalidArgument(
+                          errors::InvalidArgument(
                              "Input(MasterParam) and Output(MasterParamOut) "
                              "must be the same Tensors."));
      }
@@ -575,10 +577,10 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
                          reinterpret_cast<void*>(&p_buffer),
                          reinterpret_cast<void*>(&g_buffer),
                          reinterpret_cast<void*>(&op_num),
-                            reinterpret_cast<void*>(&mu),
-                            reinterpret_cast<void*>(&lars_coeff),
-                            reinterpret_cast<void*>(&epsilon),
-                            reinterpret_cast<void*>(&rescale_grad),
+                          reinterpret_cast<void*>(&mu_),
+                          reinterpret_cast<void*>(&lars_coeff_),
+                          reinterpret_cast<void*>(&epsilon_),
+                          reinterpret_cast<void*>(&rescale_grad_),
                          reinterpret_cast<void*>(&multi_precision)};
    // Lanuch all sm theads, and thead of each block synchronizedly cooperate.
    cudaLaunchCooperativeKernel(
@@ -587,20 +589,18 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
        LARS_BLOCK_SIZE,
        cuda_param,
        0,
-          cuda_ctx.stream());
+        dev_ctx.stream());
  } else {
    auto* param_data = param[0]->data<T>();
    auto* grad_data = grad[0]->data<T>();
    auto* velocity_data = velocity[0]->data<MT>();
    auto* lr = learning_rate[0]->data<MT>();
-      auto* param_out_data = param_out[0]->mutable_data<T>(ctx.GetPlace());
-      auto* velocity_out_data =
-          velocity_out[0]->mutable_data<MT>(ctx.GetPlace());
+    auto* param_out_data = dev_ctx.template Alloc<T>(param_out[0]);
+    auto* velocity_out_data = dev_ctx.template Alloc<MT>(velocity_out[0]);
    const MT* master_param_data =
-          multi_precision ? master_param[0]->data<MT>() : nullptr;
+        multi_precision ? master_param.get()[0]->data<MT>() : nullptr;
    MT* master_param_out_data =
-          multi_precision
-              ? master_param_out[0]->mutable_data<MT>(ctx.GetPlace())
+        multi_precision ? dev_ctx.template Alloc<MT>(master_param_out[0])
                        : nullptr;
    int64_t numel = param[0]->numel();
    MT lars_weight_decay = weight_decay_arr[0];
@@ -625,11 +625,11 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
        reinterpret_cast<void*>(&lr),
        reinterpret_cast<void*>(&p_buffer),
        reinterpret_cast<void*>(&g_buffer),
-          reinterpret_cast<void*>(&mu),
-          reinterpret_cast<void*>(&lars_coeff),
+        reinterpret_cast<void*>(&mu_),
+        reinterpret_cast<void*>(&lars_coeff_),
        reinterpret_cast<void*>(&lars_weight_decay),
-          reinterpret_cast<void*>(&epsilon),
-          reinterpret_cast<void*>(&rescale_grad),
+        reinterpret_cast<void*>(&epsilon_),
+        reinterpret_cast<void*>(&rescale_grad_),
        reinterpret_cast<void*>(&repeat_times),
        reinterpret_cast<void*>(&thresh),  // Just a placeholder
        reinterpret_cast<void*>(&numel),
@@ -641,49 +641,48 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
        LARS_BLOCK_SIZE,
        cuda_param,
        0,
-          cuda_ctx.stream());
+        dev_ctx.stream());
  }
 #else
  for (int i = 0; i < op_num; ++i) {
    const MT* master_param_data =
-          multi_precision ? master_param[i]->data<MT>() : nullptr;
+        multi_precision ? master_param.get()[i]->data<MT>() : nullptr;
    MT* master_param_out_data =
-          multi_precision
-              ? master_param_out[i]->mutable_data<MT>(ctx.GetPlace())
+        multi_precision ? dev_ctx.template Alloc<MT>(master_param_out[i])
                        : nullptr;
    SeparatedLarsMomentumOpCUDAKernel<T, MT>(
-          cuda_ctx,
+        dev_ctx,
        param[i]->data<T>(),
-          param_out[i]->mutable_data<T>(ctx.GetPlace()),
+        dev_ctx.template Alloc<T>(param_out[i]),
        velocity[i]->data<MT>(),
-          velocity_out[i]->mutable_data<MT>(ctx.GetPlace()),
+        dev_ctx.template Alloc<MT>(velocity_out[i]),
        grad[i]->data<T>(),
        learning_rate[i]->data<MT>(),
        p_buffer,
        g_buffer,
-          mu,
-          lars_coeff,
+        mu_,
+        lars_coeff_,
        weight_decay_arr[i],
-          epsilon,
-          rescale_grad,
+        epsilon_,
+        rescale_grad_,
        param[i]->numel(),
        master_param_data,
        master_param_out_data,
        multi_precision);
  }
 #endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
+}
+}  // namespace phi

-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-PD_REGISTER_STRUCT_KERNEL(lars_momentum,
+PD_REGISTER_KERNEL(lars_momentum,
                   GPU,
                   ALL_LAYOUT,
-                          ops::LarsMomentumOpCUDAKernel,
+                   phi::LarsMomentumKernel,
                   float,
                   double,
-                          plat::float16) {}
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
--- a/paddle/phi/kernels/lars_momentum_kernel.h
+++ b/paddle/phi/kernels/lars_momentum_kernel.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LarsMomentumKernel(
+    const Context& dev_ctx,
+    const std::vector<const DenseTensor*>& param,
+    const std::vector<const DenseTensor*>& velocity,
+    const std::vector<const DenseTensor*>& learning_rate,
+    const std::vector<const DenseTensor*>& grad,
+    const paddle::optional<std::vector<const DenseTensor*>>& master_param,
+    const std::vector<float>& weight_decay_arr,
+    float mu,
+    float lars_coeff,
+    float epsilon,
+    bool multi_precision,
+    float rescale_grad,
+    std::vector<DenseTensor*> param_out,
+    std::vector<DenseTensor*> velocity_out,
+    std::vector<DenseTensor*> master_param_out);
+
+}  // namespace phi
--- a/paddle/phi/ops/compat/lars_momentum_sig.cc
+++ b/paddle/phi/ops/compat/lars_momentum_sig.cc
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LarsMomentumOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "lars_momentum",
+      {"Param", "Velocity", "LearningRate", "Grad", "MasterParam"},
+      {"lars_weight_decay",
+       "mu",
+       "lars_coeff",
+       "epsilon",
+       "multi_precision",
+       "rescale_grad"},
+      {"ParamOut", "VelocityOut", "MasterParamOut"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(lars_momentum, phi::LarsMomentumOpArgumentMapping);
--- a/test/legacy_test/test_momentum_op.py
+++ b/test/legacy_test/test_momentum_op.py
@@ -312,7 +312,7 @@ class TestLarsMomentumOp(OpTest):

    def test_check_output(self):
        paddle.enable_static()
-        self.check_output()
+        self.check_output(check_dygraph=False)

    def config(self):
        self.params_num = 1