From b4f67757ff40f8dfc5dbcf5c55df2771d3f58d82 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 16 Aug 2022 05:43:37 -0500 Subject: [PATCH] [Phi] Move amp ops into phi (#45079) * move check finite and unscale kernel into phi * move infershape into phi * move update_loss_scaling kernel into phi * remove original kernels * move update loss scaling infershape into phi * add header for xpu and npu * solve coverage failed * fix npu test failed * remove mutable data in cu file * fix new executor failed * add valid check for meta tensor output --- paddle/fluid/framework/infershape_utils.cc | 22 ++ .../framework/new_executor/data_transfer.cc | 17 +- paddle/fluid/framework/operator.cc | 4 + paddle/fluid/imperative/prepared_operator.h | 4 + .../amp/check_finite_and_unscale_op.cc | 73 +--- .../amp/check_finite_and_unscale_op.cu | 172 --------- .../amp/check_finite_and_unscale_op.h | 32 -- .../amp/check_finite_and_unscale_op_mlu.cc | 2 +- .../amp/check_finite_and_unscale_op_npu.cc | 2 +- .../check_finite_and_unscale_op_npu_test.cc | 2 +- .../amp/check_finite_and_unscale_op_xpu.cc | 2 +- .../operators/amp/update_loss_scaling_op.cc | 117 +----- .../operators/amp/update_loss_scaling_op.cu | 210 ---------- .../operators/amp/update_loss_scaling_op.h | 214 ----------- .../amp/update_loss_scaling_op_npu.cc | 31 +- .../amp/update_loss_scaling_op_xpu.cc | 3 +- paddle/phi/core/meta_tensor.cc | 62 ++- paddle/phi/infermeta/multiary.cc | 241 +++++++----- paddle/phi/infermeta/multiary.h | 59 +-- paddle/phi/kernels/amp_kernel.h | 48 +++ paddle/phi/kernels/cpu/amp_kernel.cc | 115 ++++++ paddle/phi/kernels/gpu/amp_kernel.cu | 358 ++++++++++++++++++ paddle/phi/kernels/impl/amp_kernel_impl.h | 183 +++++++++ .../phi/ops/compat/update_loss_scaling_sig.cc | 47 +++ .../unittests/test_update_loss_scaling_op.py | 4 +- 25 files changed, 1053 insertions(+), 971 deletions(-) delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.cu delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.h delete mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.cu delete mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.h create mode 100644 paddle/phi/kernels/amp_kernel.h create mode 100644 paddle/phi/kernels/cpu/amp_kernel.cc create mode 100644 paddle/phi/kernels/gpu/amp_kernel.cu create mode 100644 paddle/phi/kernels/impl/amp_kernel_impl.h create mode 100644 paddle/phi/ops/compat/update_loss_scaling_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index eb988d59a2a..debe43fab82 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -127,7 +127,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { const InferShapeContext& ctx_; }; +static inline void ValidCheck(const phi::MetaTensor& meta_tensor) { + PADDLE_ENFORCE_EQ(meta_tensor.initialized(), + true, + phi::errors::InvalidArgument( + "The current CompatMetaTensor is not initialized.")); +} + int64_t CompatMetaTensor::numel() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); return var->Get().numel(); @@ -138,6 +146,7 @@ int64_t CompatMetaTensor::numel() const { } DDim CompatMetaTensor::dims() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -162,6 +171,7 @@ DDim CompatMetaTensor::dims() const { } phi::DataType CompatMetaTensor::dtype() const { + ValidCheck(*this); if 
(is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -183,6 +193,7 @@ phi::DataType CompatMetaTensor::dtype() const { } DataLayout CompatMetaTensor::layout() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -206,6 +217,7 @@ DataLayout CompatMetaTensor::layout() const { } void CompatMetaTensor::set_dims(const DDim& dims) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -236,6 +248,7 @@ void CompatMetaTensor::set_dims(const DDim& dims) { } void CompatMetaTensor::set_dtype(phi::DataType dtype) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -258,6 +271,7 @@ void CompatMetaTensor::set_dtype(phi::DataType dtype) { } void CompatMetaTensor::set_layout(DataLayout layout) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -281,6 +295,8 @@ void CompatMetaTensor::set_layout(DataLayout layout) { } void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -299,6 +315,8 @@ void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { } void CompatMetaTensor::share_dims(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); set_dims(meta_tensor.dims()); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); @@ -472,6 +490,10 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, infer_meta_context.EmplaceBackAttr( phi::Scalar(PADDLE_GET_CONST(std::string, attr))); break; + case framework::proto::AttrType::BOOLEAN: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(bool, attr))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 33e50b249ad..32277ed54bb 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -135,21 +135,14 @@ void DataTranferHelper::RunAndConstructOpFuncNode( bool run_phi_kernel = false; // check if phi kernel exists - auto phi_kernel_map = - phi::KernelFactory::Instance().SelectKernelMap(op_with_kernel->Type()); - if (phi_kernel_map.size() > 0) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( + op_with_kernel->Type())) { auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n"; - // this function is used to construct data transfer op - // we expect that it always has a valid phi kernel - // so no need to fallback to cpu kernel - PADDLE_ENFORCE_EQ( - op_with_kernel->PhiKernel()->IsValid(), - true, - platform::errors::PreconditionNotMet( - "the %s op has no valid phi kernel.", op_with_kernel->Type())); - run_phi_kernel = true; + if (op_with_kernel->PhiKernel()->IsValid()) { + run_phi_kernel = true; + } } // 3. 
Execute transfer op and construct OpFuncNode diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b4ef3efb821..4fb7b0e018d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2752,6 +2752,10 @@ void OperatorWithKernel::BuildPhiKernelContext( phi_kernel_context->EmplaceBackAttr(std::move(phi::Scalar( PADDLE_GET_CONST(std::string, attr_iter->second)))); break; + case proto::AttrType::BOOLEAN: + phi_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(PADDLE_GET_CONST(bool, attr_iter->second)))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index c2b23f31d1d..d043b4a5aad 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -420,6 +420,10 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, kernel_ctx->EmplaceBackAttr( std::move(phi::Scalar(PADDLE_GET_CONST(std::string, attr)))); break; + case framework::proto::AttrType::BOOLEAN: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(PADDLE_GET_CONST(bool, attr)))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 8fc582c1984..3404209063e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -25,23 +28,6 @@ class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel { const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasInputs("X") || ctx->HasOutputs("Out")) { - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), - ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(check_finite_and_unscale), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), - ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - } - ctx->SetOutputDim("FoundInfinite", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -91,59 +77,18 @@ Otherwise, FoundInfinite will be 0 (False). 
} }; -template -class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const T* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - *found_inf_data = false; - framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); - bool* is_finite_data = is_finite.template data(); - - auto& dev = *ctx.template device_context().eigen_device(); - - T inverse_scale = Inverse(*scale_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(dev_ctx.GetPlace()); - if (!(*found_inf_data)) { - framework::TensorIsfinite(*x, &is_finite); - *found_inf_data = !(*is_finite_data); - } - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*x); - if (!(*found_inf_data)) { - eigen_out.device(dev) = eigen_in * inverse_scale; - } else { - eigen_out.device(dev) = eigen_in * static_cast(0); - } - } - return; - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(check_finite_and_unscale, + CheckFiniteAndUnscaleInferShapeFunctor, + PD_INFER_META(phi::CheckFiniteAndUnscaleInferMeta)); REGISTER_OPERATOR( check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp, ops::CheckFiniteAndUnscaleOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleCpuKernel, - ops::CheckFiniteAndUnscaleCpuKernel); + paddle::framework::EmptyGradOpMaker, + CheckFiniteAndUnscaleInferShapeFunctor); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu deleted file mode 100644 index 35b667825af..00000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { - *o = Inverse(*s); - *found_inf = false; -} - -template -__global__ void CheckFiniteAndUnscale(const T** xs, - const MT* scale, - int64_t size, - int64_t* starts, - bool* found_inf, - T** outs) { - const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - - // copy starts array from global memory to shared memory - extern __shared__ int64_t s_starts[]; - for (int i = threadIdx.x; i <= size; i += blockDim.x) { - s_starts[i] = starts[i]; - } - __syncthreads(); - - const int64_t num = s_starts[size]; - int xs_index = 0; - bool local_found_inf = false; - const MT local_scale = *scale; - for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { - // get the "out" index of "id" - // For example: - // idx = 15, starts = [0, 10, 10, 20, 30] - // because 10 <= idx < 20 ==> - // the idx element locate in the 3rd tensor (notice the 2nd tensor size is - // 0) - int next_xs_index = xs_index; - while (idx >= s_starts[next_xs_index]) next_xs_index++; - xs_index = next_xs_index - 1; - - // get in data and out data - const T* in = xs[xs_index]; - T* out = outs[xs_index]; - int64_t in_idx = idx - s_starts[xs_index]; - - // Unscale - MT val = static_cast(in[in_idx]) * local_scale; - T narrow_val = static_cast(val); - out[in_idx] = narrow_val; - - // CheckFinite - if (!isfinite(narrow_val)) { - local_found_inf = true; - } - } - if (local_found_inf) { - *found_inf = true; - } -} - -template -class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const MPDType* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - framework::Tensor inverse_scale = - ctx.AllocateTmpTensor({1}, dev_ctx); - MPDType* inverse_scale_v = inverse_scale.template data(); - - InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( - scale_data, inverse_scale_v, found_inf_data); - - size_t xs_size = xs.size(); - if (xs_size == 0) return; - - const auto& cpu_place = platform::CPUPlace(); - // calculate each tensor's start index and copy to device - auto h_starts_tensor = - memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); - int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - - auto d_starts_tensor = - memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); - int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); - - // the start index value of each tensor is - // the sum of previous tensor's size. 
For example: - // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] - h_starts[0] = 0; - for (int i = 1; i <= xs_size; i++) { - h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); - } - int64_t total_num = h_starts[xs_size]; - memory::Copy(dev_ctx.GetPlace(), - d_starts, - cpu_place, - h_starts, - (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); - - // copy each tensor's data address to device - auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); - const T** h_xs = reinterpret_cast(h_mem->ptr()); - T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - - auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); - const T** d_xs = reinterpret_cast(d_mem->ptr()); - T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; - - for (size_t i = 0; i < xs_size; ++i) { - h_xs[i] = xs[i]->data(); - h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); - } - memory::Copy(dev_ctx.GetPlace(), - d_xs, - cpu_place, - h_xs, - 2 * xs_size * sizeof(T*), - dev_ctx.stream()); - - // Launch Kernel - int threads_per_block = std::min(static_cast(1024), total_num); - int elements_per_block = - threads_per_block * 20; // each thread deal with 20 number - int blocks_per_grid = - (total_num + elements_per_block - 1) / elements_per_block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); - VLOG(3) << "finish kernel"; - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleGpuKernel, - ops::CheckFiniteAndUnscaleGpuKernel, - ops::CheckFiniteAndUnscaleGpuKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h deleted file mode 100644 index 31647f71038..00000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -template -inline HOSTDEVICE T Inverse(T s) { - return 1.0 / s; -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 7659d67400b..41ba11ac046 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 24d55ef103c..98768afa936 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index ba7b7d900f5..cc60476c269 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -33,7 +33,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(check_finite_and_unscale); +USE_OP_ITSELF(check_finite_and_unscale); USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); struct InputVars { diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index d904e8f7936..7d46b4c7f76 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 3bae775d308..c8dc8217ef2 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -27,55 +28,6 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), - "Input", - "FoundInfinite", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), - "Input", - "PrevLossScaling", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), - "Input", - "InGoodSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), - "Input", - "InBadSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), - "Output", - "LossScaling", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), - "Output", - "OutGoodSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), - "Output", - "OutBadSteps", - "update_loss_scaling"); - - if (ctx->HasInputs("X") || ctx->HasOutputs("Out")) { - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), - ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(update_loss_scaling), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), - ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - } - - ctx->SetOutputDim("LossScaling", {1}); - ctx->SetOutputDim("OutGoodSteps", {1}); - ctx->SetOutputDim("OutBadSteps", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,72 +120,19 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. } }; -template -class UpdateLossScalingFunctor { - public: - void operator()(const phi::CPUContext& ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const { - PADDLE_ENFORCE_EQ( - IsFoundInfOnCPU, - true, - platform::errors::InvalidArgument( - "The Input(FoundInfinite) should be on the CPUPlace.")); - Update(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } -}; - -template -class LazyZeros { - public: - void operator()(const phi::CPUContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int num = out->numel(); - if (*found_inf_data) { - VLOG(1) << "-- UpdateLossScaling: Find infinite grads. 
--"; - std::memset(out_data, 0, num * sizeof(T)); - } - } - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPU = phi::CPUContext; +DECLARE_INFER_SHAPE_FUNCTOR(update_loss_scaling, + UpdateLossScalingInferShapeFunctor, + PD_INFER_META(phi::UpdateLossScalingInferMeta)); REGISTER_OPERATOR( update_loss_scaling, ops::UpdateLossScalingOp, ops::UpdateLossScalingOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(update_loss_scaling, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel); + paddle::framework::EmptyGradOpMaker, + UpdateLossScalingInferShapeFunctor); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu deleted file mode 100644 index 4c927066892..00000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ /dev/null @@ -1,210 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void GpuUpdateLossScaling(const FoundNanInfFlagT found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) { - Update(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); -} - -template -__global__ void FusedFillIf(T** outs, - const size_t xs_size, - const int64_t* starts, - const T value, - const bool* has_inf) { - if (!(*has_inf)) return; - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - // copy starts array from global memory to shared memory - extern __shared__ int64_t s_starts[]; - for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { - s_starts[i] = starts[i]; - } - __syncthreads(); - - const int64_t total_num = s_starts[xs_size]; - int out_index = 0; - - for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { - // get the "out" index of "id" - // For example: - // id = 15, starts = [0, 10, 10, 20, 30] - // because 10 <= id < 20 ==> - // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) - int next_out_index = out_index; - while (id >= s_starts[next_out_index]) next_out_index++; - out_index = next_out_index - 1; - - // get data pointer and index - T* out_data = outs[out_index]; - int64_t idx = id - s_starts[out_index]; - - // set value - out_data[idx] = value; - } -} - -template -class UpdateLossScalingFunctor { - public: - void 
operator()(const phi::GPUContext& dev_ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const { - if (IsFoundInfOnCPU) { - GpuUpdateLossScaling - <<<1, 1, 0, dev_ctx.stream()>>>(*found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } else { - GpuUpdateLossScaling - <<<1, 1, 0, dev_ctx.stream()>>>(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } - } -}; - -template -class LazyZeros { - public: - void operator()(const phi::GPUContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const { - size_t xs_size = xs.size(); - if (xs_size == 0) return; - - const auto& cpu_place = platform::CPUPlace(); - // alloc each tensor's start index and copy to device - auto h_in_starts_mem = - memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); - int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - - auto d_in_starts_mem = - memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); - int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); - - // the start index value of each tensor is - // the sum of previous tensor's size. For example: - // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] - h_starts[0] = 0; - for (int i = 0; i < xs_size; i++) { - h_starts[i + 1] = h_starts[i] + outs[i]->numel(); - } - memory::Copy(dev_ctx.GetPlace(), - d_starts, - cpu_place, - h_starts, - (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); - - // copy each tensor of "outs" data address array to device - auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); - T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - - auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); - T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); - - for (size_t i = 0; i < xs_size; ++i) { - h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); - } - memory::Copy(dev_ctx.GetPlace(), - d_out_addrs, - cpu_place, - h_out_addrs, - xs_size * sizeof(T*), - dev_ctx.stream()); - - // launch cuda kernel - int64_t total_num = h_starts[xs_size]; - int64_t threads_per_block = std::min(static_cast(1024), total_num); - int64_t elements_per_block = - threads_per_block * 50; // each thread deal with 50 data - int64_t blocks_per_grid = - (total_num + elements_per_block - 1) / elements_per_block; - FusedFillIf<<>>( - d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -using GPU = phi::GPUContext; - -REGISTER_OP_CUDA_KERNEL(update_loss_scaling, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h deleted file mode 100644 index 90c77a3a4fd..00000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) -#include -#endif // PADDLE_WITH_CUDA && __NVCC__ -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -inline HOSTDEVICE bool check_finite(T value) { -#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) - return isfinite(value); -#else - return std::isfinite(value); -#endif -} - -inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { - return found_nan_inf_data; -} - -inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { - return *found_nan_inf_data; -} - -template -inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) { - if (IsFoundNanInf(found_inf_data)) { - *good_out_data = 0; - *bad_out_data = *bad_in_data + 1; - if (*bad_out_data == decr_every_n_nan_or_inf) { - T new_loss_scaling = *pre_loss_scaling_data * decr_ratio; - *updated_loss_scaling_data = new_loss_scaling < static_cast(1) - ? static_cast(1) - : new_loss_scaling; - *bad_out_data = 0; - } - } else { - *bad_out_data = 0; - *good_out_data = *good_in_data + 1; - if (*good_out_data == incr_every_n_steps) { - T new_loss_scaling = *pre_loss_scaling_data * incr_ratio; - *updated_loss_scaling_data = check_finite(new_loss_scaling) - ? 
new_loss_scaling - : *pre_loss_scaling_data; - *good_out_data = 0; - } - } -} - -template -class UpdateLossScalingFunctor { - public: - void operator()(const DeviceContext& dev_ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const; -}; - -template -class LazyZeros { - public: - void operator()(const DeviceContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const; -}; - -template -class UpdateLossScalingKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); - PADDLE_ENFORCE_EQ(found_inf->numel(), - 1, - platform::errors::InvalidArgument( - "FoundInfinite must has only one element.")); - const bool* found_inf_data = found_inf->data(); - bool is_found_inf_on_cpu = platform::is_cpu_place(found_inf->place()); - - if (is_found_inf_on_cpu) { - if (*found_inf_data) { - phi::funcs::SetConstant set_constant; - for (auto* out : outs) { - out->mutable_data(dev_ctx.GetPlace()); - set_constant(dev_ctx, out, static_cast(0)); - } - } - } else { - LazyZeros{}(dev_ctx, found_inf_data, xs, outs); - } - - const auto* stop_update_tensor = ctx.Input("StopUpdate"); - bool stop_update = false; - if (stop_update_tensor && stop_update_tensor->IsInitialized()) { - if (platform::is_cpu_place(stop_update_tensor->place())) { - stop_update = stop_update_tensor->data()[0]; - } else { - framework::Tensor tmp_tensor; - framework::TensorCopySync( - *stop_update_tensor, platform::CPUPlace(), &tmp_tensor); - stop_update = tmp_tensor.data()[0]; - } - } - stop_update |= ctx.Attr("stop_update"); - if (stop_update) { - return; - } - - const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); - const MPDType* pre_loss_scaling_data = pre_loss_scaling->data(); - const int* good_in_data = good_in->data(); - const int* bad_in_data = bad_in->data(); - - MPDType* updated_loss_scaling_data = - updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); - int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace()); - int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace()); - - const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); - const int decr_every_n_nan_or_inf = - ctx.Attr("decr_every_n_nan_or_inf"); - const float incr_ratio = ctx.Attr("incr_ratio"); - const float decr_ratio = ctx.Attr("decr_ratio"); - if (is_found_inf_on_cpu) { - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } else { - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - 
incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 61bbdf4e0fc..24784803f17 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" DECLARE_int32(min_loss_scaling); @@ -150,9 +150,7 @@ void Update(const platform::NPUDeviceContext& ctx, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: void operator()(const platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, @@ -270,19 +268,18 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_vec, - pre_loss_scaling, - good_in, - bad_in, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling, - good_out, - bad_out); + UpdateLossScalingFunctor{}(dev_ctx, + found_inf_vec, + pre_loss_scaling, + good_in, + bad_in, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling, + good_out, + bad_out); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index 1eefe564156..2348c10ef55 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -19,12 +19,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class UpdateLossScalingXPUKernel : public framework::OpKernel { using MPDType = typename details::MPTypeTrait::Type; diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index f0cd841235e..9a008e429da 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -25,15 +25,35 @@ limitations under the License. 
*/ namespace phi { -int64_t MetaTensor::numel() const { return tensor_->numel(); } +static inline void ValidCheck(const MetaTensor& meta_tensor) { + PADDLE_ENFORCE_EQ(meta_tensor.initialized(), + true, + phi::errors::InvalidArgument( + "The current MetaTensor is not initialized.")); +} -DDim MetaTensor::dims() const { return tensor_->dims(); } +int64_t MetaTensor::numel() const { + ValidCheck(*this); + return tensor_->numel(); +} -DataType MetaTensor::dtype() const { return tensor_->dtype(); } +DDim MetaTensor::dims() const { + ValidCheck(*this); + return tensor_->dims(); +} -DataLayout MetaTensor::layout() const { return tensor_->layout(); } +DataType MetaTensor::dtype() const { + ValidCheck(*this); + return tensor_->dtype(); +} + +DataLayout MetaTensor::layout() const { + ValidCheck(*this); + return tensor_->layout(); +} void MetaTensor::set_dims(const DDim& dims) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->dims = dims; @@ -51,6 +71,7 @@ void MetaTensor::set_dims(const DDim& dims) { } void MetaTensor::set_dtype(DataType dtype) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dtype = dtype; @@ -67,6 +88,7 @@ void MetaTensor::set_dtype(DataType dtype) { } void MetaTensor::set_layout(DataLayout layout) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->layout = layout; @@ -83,6 +105,8 @@ void MetaTensor::set_layout(DataLayout layout) { } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); if (meta_tensor.lod().size() == 0) { // no need share return; @@ -101,18 +125,8 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) { } } -const LoD& MetaTensor::lod() const { - if (phi::DenseTensor::classof(tensor_)) { - return static_cast(tensor_)->lod(); - } else if (phi::SelectedRows::classof(tensor_)) { - return static_cast(tensor_)->value().lod(); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.", - tensor_->type_info().name())); - } -} - void MetaTensor::share_meta(const MetaTensor& meta_tensor) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_) || phi::SelectedRows::classof(tensor_)) { share_dims(meta_tensor); @@ -125,9 +139,8 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } -TensorBase* MetaTensor::tensor() const { return tensor_; } - void MetaTensor::share_dims(const MetaTensor& meta_tensor) { + ValidCheck(*this); bool is_dense_tensor = phi::DenseTensor::classof(tensor_); bool is_selected_rows = phi::SelectedRows::classof(tensor_); if (is_dense_tensor || is_selected_rows) { @@ -152,4 +165,19 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { bool MetaTensor::initialized() const { return tensor_ != nullptr; } +// Private Member Methods + +const LoD& MetaTensor::lod() const { + if (phi::DenseTensor::classof(tensor_)) { + return static_cast(tensor_)->lod(); + } else if (phi::SelectedRows::classof(tensor_)) { + return static_cast(tensor_)->value().lod(); + } else { + PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.", + tensor_->type_info().name())); + } +} + +TensorBase* MetaTensor::tensor() const { return tensor_; } + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 27c3d20d853..12c5ba109ab 100644 --- a/paddle/phi/infermeta/multiary.cc +++ 
b/paddle/phi/infermeta/multiary.cc @@ -764,6 +764,27 @@ void BroadcastTensorsInferMeta(const std::vector& x, } } +void CheckFiniteAndUnscaleInferMeta(const std::vector& xs, + const MetaTensor& scale, + std::vector outs, + MetaTensor* found_infinite) { + PADDLE_ENFORCE_EQ( + xs.size(), + outs.size(), + phi::errors::InvalidArgument( + "The input(X) and output(Out) should have same size in " + "Operator(check_finite_and_unscale), size of input(X) is %d " + "and size of output(Out) is %d.", + xs.size(), + outs.size())); + for (size_t i = 0; i < xs.size(); ++i) { + outs[i]->set_dims(xs[i]->dims()); + outs[i]->set_dtype(xs[i]->dtype()); + } + found_infinite->set_dims({1}); + found_infinite->set_dtype(DataType::BOOL); +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, @@ -1109,6 +1130,102 @@ void GenerateProposalsV2InferMeta(const MetaTensor& scores, rpn_roi_probs->set_dims(phi::make_ddim({-1, 1})); } +void GraphReindexInferMeta(const MetaTensor& x, + const MetaTensor& neighbors, + const MetaTensor& count, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, + bool flag_buffer_hashtable, + MetaTensor* reindex_src, + MetaTensor* reindex_dst, + MetaTensor* out_nodes) { + auto GraphReindexShapeCheck = [](const phi::DDim& dims, + std::string tensor_name) { + if (dims.size() == 2) { + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, + dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dims.size(), + 1, + phi::errors::InvalidArgument( + "The %s should be 1D, when it is not 2D, but we get %d", + tensor_name, + dims.size())); + } + }; + + GraphReindexShapeCheck(x.dims(), "X"); + GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); + GraphReindexShapeCheck(count.dims(), "Count"); + if (flag_buffer_hashtable) { + GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); + GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); + } + + reindex_src->set_dims({-1}); + reindex_src->set_dtype(neighbors.dtype()); + reindex_dst->set_dims({-1}); + reindex_dst->set_dtype(neighbors.dtype()); + out_nodes->set_dims({-1}); + out_nodes->set_dtype(x.dtype()); +} + +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids) { + // GSN: GraphSampleNeighbors + auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { + if (dims.size() == 2) { + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, + dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dims.size(), + 1, + phi::errors::InvalidArgument( + "The %s should be 1D, when it is not 2D, but we get %d", + tensor_name, + dims.size())); + } + }; + + GSNShapeCheck(row.dims(), "Row"); + GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); + GSNShapeCheck(x.dims(), "X"); + if (return_eids) { + GSNShapeCheck(eids.dims(), "Eids"); + out_eids->set_dims({-1}); + out_eids->set_dtype(row.dtype()); + } + if (flag_perm_buffer) { + GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); + } + + out->set_dims({-1}); + out->set_dtype(row.dtype()); + out_count->set_dims({-1}); + out_count->set_dtype(DataType::INT32); +} + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const 
MetaTensor& w, const MetaTensor& label, @@ -2294,6 +2411,34 @@ void UnchangedMultiInferMeta(const std::vector& x, } } +void UpdateLossScalingInferMeta(const std::vector& xs, + const MetaTensor& found_infinite, + const MetaTensor& prev_loss_scaling, + const MetaTensor& in_good_steps, + const MetaTensor& in_bad_steps, + std::vector outs, + MetaTensor* loss_scaling, + MetaTensor* out_good_steps, + MetaTensor* out_bad_steps) { + PADDLE_ENFORCE_EQ(xs.size(), + outs.size(), + phi::errors::InvalidArgument( + "The input(X) and output(Out) should have same size in " + "Operator(update_loss_scaling), size of input(X) is %d " + "and size of output(Out) is %d.", + xs.size(), + outs.size())); + for (size_t i = 0; i < xs.size(); ++i) { + outs[i]->set_dims(xs[i]->dims()); + outs[i]->set_dtype(xs[i]->dtype()); + } + loss_scaling->set_dims({1}); + out_good_steps->set_dims({1}); + out_good_steps->set_dtype(DataType::INT32); + out_bad_steps->set_dims({1}); + out_bad_steps->set_dtype(DataType::INT32); +} + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const MetaTensor& logits_length, @@ -2356,102 +2501,6 @@ void WhereInferMeta(const MetaTensor& condition, out->share_meta(x); } -void GraphReindexInferMeta(const MetaTensor& x, - const MetaTensor& neighbors, - const MetaTensor& count, - const MetaTensor& hashtable_value, - const MetaTensor& hashtable_index, - bool flag_buffer_hashtable, - MetaTensor* reindex_src, - MetaTensor* reindex_dst, - MetaTensor* out_nodes) { - auto GraphReindexShapeCheck = [](const phi::DDim& dims, - std::string tensor_name) { - if (dims.size() == 2) { - PADDLE_ENFORCE_EQ( - dims[1], - 1, - phi::errors::InvalidArgument("The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, - dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dims.size(), - 1, - phi::errors::InvalidArgument( - "The %s should be 1D, when it is not 2D, but we get %d", - tensor_name, - dims.size())); - } - }; - - GraphReindexShapeCheck(x.dims(), "X"); - GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); - GraphReindexShapeCheck(count.dims(), "Count"); - if (flag_buffer_hashtable) { - GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); - GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); - } - - reindex_src->set_dims({-1}); - reindex_src->set_dtype(neighbors.dtype()); - reindex_dst->set_dims({-1}); - reindex_dst->set_dtype(neighbors.dtype()); - out_nodes->set_dims({-1}); - out_nodes->set_dtype(x.dtype()); -} - -void GraphSampleNeighborsInferMeta(const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - const MetaTensor& eids, - const MetaTensor& perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids) { - // GSN: GraphSampleNeighbors - auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { - if (dims.size() == 2) { - PADDLE_ENFORCE_EQ( - dims[1], - 1, - phi::errors::InvalidArgument("The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, - dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dims.size(), - 1, - phi::errors::InvalidArgument( - "The %s should be 1D, when it is not 2D, but we get %d", - tensor_name, - dims.size())); - } - }; - - GSNShapeCheck(row.dims(), "Row"); - GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); - GSNShapeCheck(x.dims(), "X"); - if (return_eids) { - GSNShapeCheck(eids.dims(), "Eids"); - out_eids->set_dims({-1}); - out_eids->set_dtype(row.dtype()); - } - if (flag_perm_buffer) { - 
GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); - } - - out->set_dims({-1}); - out->set_dtype(row.dtype()); - out_count->set_dims({-1}); - out_count->set_dtype(DataType::INT32); -} - void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 05c834cebf3..b01a23b9ee3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -196,6 +196,11 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, void BroadcastTensorsInferMeta(const std::vector& x, std::vector out); +void CheckFiniteAndUnscaleInferMeta(const std::vector& xs, + const MetaTensor& scale, + std::vector outs, + MetaTensor* found_infinite); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, @@ -237,6 +242,28 @@ void GenerateProposalsV2InferMeta(const MetaTensor& scores, MetaTensor* rpn_roi_probs, MetaTensor* rpn_rois_num); +void GraphReindexInferMeta(const MetaTensor& x, + const MetaTensor& neighbors, + const MetaTensor& count, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, + bool flag_buffer_hashtable, + MetaTensor* reindex_src, + MetaTensor* reindex_dst, + MetaTensor* out_nodes); + +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids); + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, @@ -415,6 +442,16 @@ void StackInferMeta(const std::vector& x, void UnchangedMultiInferMeta(const std::vector& x, std::vector out); +void UpdateLossScalingInferMeta(const std::vector& xs, + const MetaTensor& found_infinite, + const MetaTensor& prev_loss_scaling, + const MetaTensor& in_good_steps, + const MetaTensor& in_bad_steps, + std::vector outs, + MetaTensor* loss_scaling, + MetaTensor* out_good_steps, + MetaTensor* out_bad_steps); + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const MetaTensor& logits_length, @@ -429,28 +466,6 @@ void WhereInferMeta(const MetaTensor& condition, const MetaTensor& y, MetaTensor* out); -void GraphReindexInferMeta(const MetaTensor& x, - const MetaTensor& neighbors, - const MetaTensor& count, - const MetaTensor& hashtable_value, - const MetaTensor& hashtable_index, - bool flag_buffer_hashtable, - MetaTensor* reindex_src, - MetaTensor* reindex_dst, - MetaTensor* out_nodes); - -void GraphSampleNeighborsInferMeta(const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - const MetaTensor& eids, - const MetaTensor& perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids); - void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, diff --git a/paddle/phi/kernels/amp_kernel.h b/paddle/phi/kernels/amp_kernel.h new file mode 100644 index 00000000000..e22a730db67 --- /dev/null +++ b/paddle/phi/kernels/amp_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <vector> + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& xs, + const DenseTensor& scale, + std::vector<DenseTensor*> outs, + DenseTensor* found_infinite); + +template <typename T, typename Context> +void UpdateLossScalingKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& xs, + const DenseTensor& found_infinite, + const DenseTensor& prev_loss_scaling, + const DenseTensor& in_good_steps, + const DenseTensor& in_bad_steps, + int incr_every_n_steps, + int decr_every_n_nan_or_inf, + float incr_ratio, + float decr_ratio, + const Scalar& stop_update, + std::vector<DenseTensor*> outs, + DenseTensor* loss_scaling, + DenseTensor* out_good_steps, + DenseTensor* out_bad_steps); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/amp_kernel.cc b/paddle/phi/kernels/cpu/amp_kernel.cc new file mode 100644 index 00000000000..d15c91dc8dd --- /dev/null +++ b/paddle/phi/kernels/cpu/amp_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
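+// CPU implementations of the AMP (auto mixed precision) kernels: +// check_finite_and_unscale and update_loss_scaling.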
+ +#include "paddle/phi/kernels/amp_kernel.h" + +#include + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/amp_kernel_impl.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { + +// Utils + +template +class UpdateLossScalingFunctor { + public: + void operator()(const phi::CPUContext& ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const { + PADDLE_ENFORCE_EQ( + IsFoundInfOnCPU, + true, + phi::errors::InvalidArgument( + "The Input(FoundInfinite) should be on the CPUPlace.")); + Update(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } +}; + +// Kernels + +template +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& scale, + std::vector outs, + DenseTensor* found_infinite) { + const T* scale_data = scale.data(); + bool* found_inf_data = dev_ctx.template Alloc(found_infinite); + + *found_inf_data = false; + DenseTensor is_finite = Empty(dev_ctx, {1}); + bool* is_finite_data = is_finite.template data(); + + auto& dev = *dev_ctx.eigen_device(); + + T inverse_scale = 1.0 / *scale_data; + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + dev_ctx.template Alloc(out); + if (!(*found_inf_data)) { + paddle::framework::TensorIsfinite(*x, &is_finite); + *found_inf_data = !(*is_finite_data); + } + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(*x); + if (!(*found_inf_data)) { + eigen_out.device(dev) = eigen_in * inverse_scale; + } else { + eigen_out.device(dev) = eigen_in * static_cast(0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(check_finite_and_unscale, + CPU, + ALL_LAYOUT, + phi::CheckFiniteAndUnscaleKernel, + float, + double) {} + +PD_REGISTER_KERNEL(update_loss_scaling, + CPU, + ALL_LAYOUT, + phi::UpdateLossScalingKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu new file mode 100644 index 00000000000..b0b0f9c5e79 --- /dev/null +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -0,0 +1,358 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/amp_kernel.h" + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/impl/amp_kernel_impl.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +// Utils + +template +__global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { + *o = 1.0 / *s; + *found_inf = false; +} + +template +__global__ void CheckFiniteAndUnscale(const T** xs, + const MT* scale, + int64_t size, + int64_t* starts, + bool* found_inf, + T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; + + // get in data and out data + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * local_scale; + T narrow_val = static_cast(val); + out[in_idx] = narrow_val; + + // CheckFinite + if (!isfinite(narrow_val)) { + local_found_inf = true; + } + } + if (local_found_inf) { + *found_inf = true; + } +} + +template +__global__ void GpuUpdateLossScaling(const FoundNanInfFlagT found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) { + Update(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); +} + +template +__global__ void FusedFillIf(T** outs, + const size_t xs_size, + const int64_t* starts, + const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - s_starts[out_index]; + + // set value + out_data[idx] = value; + } +} + +template +class LazyZeros { + public: + 
void operator()(const phi::GPUContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) { + size_t xs_size = xs.size(); + if (xs_size == 0) return; + + const auto& cpu_place = phi::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_starts, + cpu_place, + h_starts, + (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = + paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = paddle::memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = dev_ctx.Alloc(outs[i]); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_out_addrs, + cpu_place, + h_out_addrs, + xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); + } +}; + +template +class UpdateLossScalingFunctor { + public: + void operator()(const phi::GPUContext& dev_ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const { + if (IsFoundInfOnCPU) { + GpuUpdateLossScaling + <<<1, 1, 0, dev_ctx.stream()>>>(*found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } else { + GpuUpdateLossScaling + <<<1, 1, 0, dev_ctx.stream()>>>(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } + } +}; + +// Kernels + +template +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& scale, + std::vector outs, + DenseTensor* found_infinite) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + const MPDType* scale_data = scale.data(); + bool* found_inf_data = dev_ctx.template Alloc(found_infinite); + + DenseTensor inverse_scale = Empty(dev_ctx, {1}); + MPDType* inverse_scale_v = inverse_scale.template data(); + + InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( + scale_data, 
inverse_scale_v, found_inf_data); + + size_t xs_size = xs.size(); + if (xs_size == 0) return; + + const auto& cpu_place = phi::CPUPlace(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // x = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + paddle::memory::Copy(dev_ctx.GetPlace(), + d_starts, + cpu_place, + h_starts, + (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = paddle::memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = paddle::memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = dev_ctx.template Alloc(outs[i]); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_xs, + cpu_place, + h_xs, + 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + CheckFiniteAndUnscale<<>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); +} + +} // namespace phi + +PD_REGISTER_KERNEL(check_finite_and_unscale, + GPU, + ALL_LAYOUT, + phi::CheckFiniteAndUnscaleKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(update_loss_scaling, + GPU, + ALL_LAYOUT, + phi::UpdateLossScalingKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/amp_kernel_impl.h b/paddle/phi/kernels/impl/amp_kernel_impl.h new file mode 100644 index 00000000000..4684a924cf0 --- /dev/null +++ b/paddle/phi/kernels/impl/amp_kernel_impl.h @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
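+
+// Device-independent pieces of the update_loss_scaling kernel: the Update
+// rule below and the shared UpdateLossScalingKernel template. On an
+// overflow, the good-step counter resets and the bad-step counter grows;
+// after decr_every_n_nan_or_inf consecutive bad steps the loss scaling is
+// multiplied by decr_ratio (but never reduced below 1). On a finite step,
+// the bad-step counter resets and the good-step counter grows; after
+// incr_every_n_steps consecutive good steps the loss scaling is multiplied
+// by incr_ratio, provided the result stays finite.
+//
+// Illustrative numbers only: with incr_every_n_steps = 1000 and
+// incr_ratio = 2.0, a scaling of 32768 becomes 65536 after 1000 finite
+// steps in a row; with decr_every_n_nan_or_inf = 2 and decr_ratio = 0.5,
+// two overflowing steps in a row halve it to 16384.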
+ +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/amp_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +inline HOSTDEVICE bool CheckFinite(T value) { +#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) + return isfinite(value); +#else + return std::isfinite(value); +#endif +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { + return found_nan_inf_data; +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { + return *found_nan_inf_data; +} + +template +inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) { + if (IsFoundNanInf(found_inf_data)) { + *good_out_data = 0; + *bad_out_data = *bad_in_data + 1; + if (*bad_out_data == decr_every_n_nan_or_inf) { + T new_loss_scaling = *pre_loss_scaling_data * decr_ratio; + *updated_loss_scaling_data = new_loss_scaling < static_cast(1) + ? static_cast(1) + : new_loss_scaling; + *bad_out_data = 0; + } + } else { + *bad_out_data = 0; + *good_out_data = *good_in_data + 1; + if (*good_out_data == incr_every_n_steps) { + T new_loss_scaling = *pre_loss_scaling_data * incr_ratio; + *updated_loss_scaling_data = CheckFinite(new_loss_scaling) + ? new_loss_scaling + : *pre_loss_scaling_data; + *good_out_data = 0; + } + } +} + +template +class LazyZeros { + public: + void operator()(const DeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const {} +}; + +template +class UpdateLossScalingFunctor { + public: + void operator()(const DeviceContext& dev_ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const; +}; + +template +void UpdateLossScalingKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& found_infinite, + const DenseTensor& prev_loss_scaling, + const DenseTensor& in_good_steps, + const DenseTensor& in_bad_steps, + int incr_every_n_steps, + int decr_every_n_nan_or_inf, + float incr_ratio, + float decr_ratio, + const Scalar& stop_update, + std::vector outs, + DenseTensor* loss_scaling, + DenseTensor* out_good_steps, + DenseTensor* out_bad_steps) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + PADDLE_ENFORCE_EQ( + found_infinite.numel(), + 1, + phi::errors::InvalidArgument("FoundInfinite must has only one element.")); + const bool* found_inf_data = found_infinite.data(); + bool is_found_inf_on_cpu = + found_infinite.place().GetType() == AllocationType::CPU; + + if (is_found_inf_on_cpu) { + if (*found_inf_data) { + for (auto* out : outs) { + Full(dev_ctx, vectorize(out->dims()), static_cast(0), out); + } + } + } else { + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); + } + + auto stop_update_val = stop_update.to(); + if (stop_update_val) { + return; + } + + const MPDType* pre_loss_scaling_data = prev_loss_scaling.data(); + const int* good_in_data = in_good_steps.data(); + const int* bad_in_data = in_bad_steps.data(); + + MPDType* 
updated_loss_scaling_data = + dev_ctx.template Alloc(loss_scaling); + int* good_out_data = dev_ctx.template Alloc(out_good_steps); + int* bad_out_data = dev_ctx.template Alloc(out_bad_steps); + + if (is_found_inf_on_cpu) { + UpdateLossScalingFunctor{}( + dev_ctx, + found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } else { + UpdateLossScalingFunctor{}( + dev_ctx, + found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/update_loss_scaling_sig.cc b/paddle/phi/ops/compat/update_loss_scaling_sig.cc new file mode 100644 index 00000000000..8223d0c7dfd --- /dev/null +++ b/paddle/phi/ops/compat/update_loss_scaling_sig.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UpdateLossScalingOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.HasInput("StopUpdate")) { + return KernelSignature( + "update_loss_scaling", + {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, + {"incr_every_n_steps", + "decr_every_n_nan_or_inf", + "incr_ratio", + "decr_ratio", + "StopUpdate"}, + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); + } else { + return KernelSignature( + "update_loss_scaling", + {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, + {"incr_every_n_steps", + "decr_every_n_nan_or_inf", + "incr_ratio", + "decr_ratio", + "stop_update"}, + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(update_loss_scaling, + phi::UpdateLossScalingOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py index 0d31dad8199..7337d320e7d 100644 --- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -50,6 +50,7 @@ class TestUpdateLossScalingOp(OpTest): self.num_good_steps = np.array([999], dtype=np.int32) self.num_bad_steps = np.array([1], dtype=np.int32) self.zero_steps = np.array([0], dtype=np.int32) + self.stop_update = np.array([False], dtype=np.bool) self.attrs = { 'incr_every_n_steps': 1000, 'decr_every_n_nan_or_inf': 2, @@ -77,7 +78,8 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): 'FoundInfinite': found_inf, 'PrevLossScaling': self.prev_loss_scaling, 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps + 'InBadSteps': self.num_bad_steps, + 'StopUpdate': self.stop_update } self.outputs = { -- GitLab