[NPU] add clear_float_status op (#34190)

* add clear_float_status op * refine infershape * fix typo * refine check_finite_and_scale * refine code

[NPU] add clear_float_status op (#34190)
* add clear_float_status op * refine infershape * fix typo * refine check_finite_and_scale * refine code
0e4bcede · Leo Chen · GitHub · 7049af57 · 0e4bcede · 0e4bcede
5 changed files
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -97,10 +97,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
          NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
      runner_mul.Run(stream);
    }
-    const auto& runner_clear_status =
-        NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
-    runner_clear_status.Run(stream);
  }
 };

--- a/paddle/fluid/operators/amp/clear_float_status_op.cc
+++ b/paddle/fluid/operators/amp/clear_float_status_op.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cstring>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Operator definition for `clear_float_status`.
+//
+// Reads the variable "FloatStatus" and produces "FloatStatusOut", whose
+// dimensions are copied from the input during shape inference.
+class ClearFloatStatusOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates that both the input and the output are present, then
+  // propagates the input dims to the output.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // The input must be checked as well: GetInputDim() below reads it, and a
+    // missing input should surface as a clear op-level error message.
+    OP_INOUT_CHECK(ctx->HasInput("FloatStatus"), "Input", "FloatStatus",
+                   "clear_float_status");
+    OP_INOUT_CHECK(ctx->HasOutput("FloatStatusOut"), "Output", "FloatStatusOut",
+                   "clear_float_status");
+    ctx->SetOutputDim("FloatStatusOut", ctx->GetInputDim("FloatStatus"));
+  }
+ protected:
+  // The kernel is always selected as FP32 on the place the op executes on,
+  // regardless of the input variable's data type.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
+};
+// Proto maker for `clear_float_status`: declares one input ("FloatStatus"),
+// one output ("FloatStatusOut") and the operator's doc comment.
+class ClearFloatStatusMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    // Input: the float-status tensor.
+    AddInput(
+        "FloatStatus",
+        "(Tensor) of shape {8} that holds the float status.");
+    // Output: the float-status tensor after clearing.
+    AddOutput(
+        "FloatStatusOut",
+        "(Tensor) of shape {8} that holds the float status, which is cleared.");
+    AddComment(R"DOC(
+      Clear the float status
+)DOC");
+  }
+};
+// Placeholder kernel for `clear_float_status`: it performs no work and
+// unconditionally raises an Unimplemented error when executed.
+template <typename DeviceContext, typename T>
+class ClearFloatStatusKernel : public framework::OpKernel<T> {
+ public:
+  // The execution context is intentionally unused; the kernel only throws.
+  void Compute(const framework::ExecutionContext& /*ctx*/) const override {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Operator clear_float_status is not supported on CPU"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // Alias used by the registrations below.
+using CPU = paddle::platform::CPUDeviceContext;
+REGISTER_OPERATOR(  // Registers the op class, its maker and two grad makers.
+    clear_float_status, ops::ClearFloatStatusOp, ops::ClearFloatStatusMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(clear_float_status,  // Only a float kernel is listed.
+                       ops::ClearFloatStatusKernel<CPU, float>);
--- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
+++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cmath>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// NPU kernel for `clear_float_status`.
+//
+// Requires "FloatStatus" and "FloatStatusOut" to be the very same tensor
+// object, then launches the "NPUClearFloatStatus" op on the NPU stream of the
+// current device context.
+template <typename DeviceContext, typename T>
+class ClearFloatStatusKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* float_status = ctx.Input<framework::Tensor>("FloatStatus");
+    auto* float_status_out = ctx.Output<framework::Tensor>("FloatStatusOut");
+    // NOTE(zhiqiu): NPUClearFloatStatus modifies the input, hence input and
+    // output are required to alias each other.
+    PADDLE_ENFORCE_EQ(float_status_out, float_status,
+                      platform::errors::PreconditionNotMet(
+                          "The input(FloatStatus) and Output(FloatStatusOut) "
+                          "should be the same."));
+    // Scratch tensor of 8 floats handed to the runner as its output slot.
+    Tensor scratch;
+    scratch.mutable_data<float>({8}, ctx.GetPlace());
+    const NpuOpRunner clear_runner("NPUClearFloatStatus", {*float_status},
+                                   {scratch});
+    const auto& npu_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    auto stream = npu_ctx.stream();
+    clear_runner.Run(stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // Alias used by the registration below.
+REGISTER_OP_NPU_KERNEL(  // Registers the float NPU kernel for the op.
+    clear_float_status,
+    ops::ClearFloatStatusKernel<paddle::platform::NPUDeviceContext, float>);
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -173,6 +173,10 @@ class OptimizerWithMixedPrecision(object):
            self._train_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status}, )
+            self._train_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
            self._float_status = float_status
        else:
            self._float_status = None

--- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
@@ -40,6 +40,10 @@ class TestCheckFiniteAndUnscale(unittest.TestCase):
            main_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
            c = paddle.fluid.layers.elementwise_div(a, b)
            out, found_inf = check_finite_and_unscale(
                [c], scale, float_status=float_status)
@@ -106,12 +110,20 @@ class TestCheckFiniteAndUnscaleClearFloatStatus(unittest.TestCase):
            main_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
            c = paddle.fluid.layers.elementwise_div(a, b)
            out, found_inf = check_finite_and_unscale(
                [c], scale, float_status=float_status)
            main_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status}, )
+            main_program.global_block().append_op(
+                type="clear_float_status",
+                inputs={"FloatStatus": float_status},
+                outputs={"FloatStatusOut": float_status}, )
            d = paddle.fluid.layers.elementwise_add(a, b)
            out, found_inf = check_finite_and_unscale(
                [d], scale, float_status=float_status)