From 0e4bcede044effad239e8b6aa8aa2b23dd96487e Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 16 Jul 2021 15:20:24 +0800
Subject: [PATCH] [NPU] add clear_float_status op (#34190)

* add clear_float_status op

* refine infershape

* fix typo

* refine check_finite_and_scale

* refine code
---
 .../amp/check_finite_and_unscale_op_npu.cc    |  4 -
 .../operators/amp/clear_float_status_op.cc    | 77 +++++++++++++++++++
 .../amp/clear_float_status_op_npu.cc          | 54 +++++++++++++
 .../contrib/mixed_precision/decorator.py      |  4 +
 .../test_amp_check_finite_and_scale_op_npu.py | 12 +++
 5 files changed, 147 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/amp/clear_float_status_op.cc
 create mode 100644 paddle/fluid/operators/amp/clear_float_status_op_npu.cc

diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
index 26280cd2bd1..68da8fd5808 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -97,10 +97,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
           NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
       runner_mul.Run(stream);
     }
-
-    const auto& runner_clear_status =
-        NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
-    runner_clear_status.Run(stream);
   }
 };

diff --git a/paddle/fluid/operators/amp/clear_float_status_op.cc b/paddle/fluid/operators/amp/clear_float_status_op.cc
new file mode 100644
index 00000000000..7a906a51879
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cstring>
#include <string>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

class ClearFloatStatusOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasOutput("FloatStatusOut"), "Output", "FloatStatusOut",
                   "clear_float_status");
    ctx->SetOutputDim("FloatStatusOut", ctx->GetInputDim("FloatStatus"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }
};

class ClearFloatStatusMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FloatStatus",
             "(Tensor) of shape {8} that holds the float status.");
    AddOutput(
        "FloatStatusOut",
        "(Tensor) of shape {8} that holds the float status, which is cleared.");
    AddComment(R"DOC(
  Clear the float status
)DOC");
  }
};

template <typename DeviceContext, typename T>
class ClearFloatStatusKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Operator clear_float_status is not supported on CPU"));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;

REGISTER_OPERATOR(
    clear_float_status, ops::ClearFloatStatusOp, ops::ClearFloatStatusMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);

REGISTER_OP_CPU_KERNEL(clear_float_status,
                       ops::ClearFloatStatusKernel<CPU, float>);
diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
new file mode 100644
index 00000000000..d5bdcc37c2a
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <memory>
#include <string>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class ClearFloatStatusKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* float_status = ctx.Input<framework::Tensor>("FloatStatus");
    auto* float_status_out = ctx.Output<framework::Tensor>("FloatStatusOut");
    // NOTE(zhiqiu): NPUClearFloatStatus modifies the input.
    PADDLE_ENFORCE_EQ(float_status_out, float_status,
                      platform::errors::PreconditionNotMet(
                          "The input(FloatStatus) and Output(FloatStatusOut) "
                          "should be the same."));
    Tensor tmp;
    tmp.mutable_data<float>({8}, ctx.GetPlace());
    const auto& runner =
        NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    clear_float_status,
    ops::ClearFloatStatusKernel<paddle::platform::NPUDeviceContext, float>);
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index d5d2e7a0d96..09b8629a978 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -173,6 +173,10 @@ class OptimizerWithMixedPrecision(object):
             self._train_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status}, )
+            self._train_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
             self._float_status = float_status
         else:
             self._float_status = None
diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
index 8828892dca3..e92bfbb4d77 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
@@ -40,6 +40,10 @@ class TestCheckFiniteAndUnscale(unittest.TestCase):
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
             c = paddle.fluid.layers.elementwise_div(a, b)
             out, found_inf = check_finite_and_unscale(
                 [c], scale, float_status=float_status)
@@ -106,12 +110,20 @@ class TestCheckFiniteAndUnscaleClearFloatStatus(unittest.TestCase):
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
             c = paddle.fluid.layers.elementwise_div(a, b)
             out, found_inf = check_finite_and_unscale(
                 [c], scale, float_status=float_status)
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
             d = paddle.fluid.layers.elementwise_add(a, b)
             out, found_inf = check_finite_and_unscale(
                 [d], scale, float_status=float_status)
-- 
GitLab
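
Usage sketch (not part of the patch): the snippet below shows how the new clear_float_status op is meant to be paired with alloc_float_status and check_finite_and_unscale when building an NPU program by hand, mirroring the unit-test changes above. Variable names, the create_var call, and the import path for check_finite_and_unscale are assumptions taken from the test file, not anything this patch prescribes.

# Minimal sketch of the intended usage on NPU, modeled on the unit test above.
# Assumed names/paths: main_program, float_status, and the amp_nn import.
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.mixed_precision.amp_nn import check_finite_and_unscale

paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    a = fluid.data(name="a", shape=[32, 32], dtype="float32")
    b = fluid.data(name="b", shape=[32, 32], dtype="float32")
    scale = fluid.layers.fill_constant(shape=[1], dtype="float32", value=1.0)

    float_status = main_program.global_block().create_var(
        name="float_status", dtype="float32", persistable=True)
    # Allocate the 8-element float-status buffer once per program ...
    main_program.global_block().append_op(
        type="alloc_float_status", outputs={"FloatStatus": float_status})
    # ... and clear it right before the ops whose overflow status will be
    # checked, so stale flags from earlier computation are not reported.
    main_program.global_block().append_op(
        type="clear_float_status",
        inputs={"FloatStatus": float_status},
        outputs={"FloatStatusOut": float_status})

    c = fluid.layers.elementwise_div(a, b)
    out, found_inf = check_finite_and_unscale(
        [c], scale, float_status=float_status)

This matches the change in the first hunk: the status register is no longer cleared inside check_finite_and_unscale after the check, but explicitly cleared up front, before the computation whose status is being collected.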