Unverified  Commit a119686c authored by ronnywang and committed by GitHub

[NPU] fix pool_op, interpolate_op (#45445)

* [NPU] fix pool_op, interpolate_op

* fix slice_op_npu

* fix test_mixed_precision_npu
Parent 45a91158
@@ -25,12 +25,14 @@ using DataLayout = framework::DataLayout;
 inline static void CheckArgument(const framework::ExecutionContext& ctx) {
   const std::string interp_method = ctx.Attr<std::string>("interp_method");
+#if (CANN_VERSION_CODE < 512000)
   bool align_corners = ctx.Attr<bool>("align_corners");
   PADDLE_ENFORCE_EQ(
       align_corners,
       false,
       platform::errors::InvalidArgument(
           "NPU Interpolate Kernel has diff when align_corners is true."));
+#endif
   PADDLE_ENFORCE_EQ(
       interp_method,
       "nearest",
......
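Note: this guard means that on CANN toolkits older than 5.1.2 the nearest-interpolation kernel rejects align_corners=true at runtime, since its output differs from the reference there. For intuition, align_corners only changes how output indices map back to input coordinates. A minimal numpy sketch of the two mappings (illustrative only, not Paddle's NPU kernel):

import numpy as np

def nearest_interp_1d(x, out_size, align_corners):
    # Illustrative source-index mapping for 1-D nearest interpolation.
    in_size = x.shape[0]
    out_idx = np.arange(out_size)
    if align_corners and out_size > 1:
        # Grid endpoints of input and output coincide.
        src = np.round(out_idx * (in_size - 1) / (out_size - 1))
    else:
        # Pixel-area mapping; endpoints generally differ.
        src = np.floor(out_idx * in_size / out_size)
    return x[np.clip(src.astype(int), 0, in_size - 1)]

x = np.array([0., 10., 20., 30.])
print(nearest_interp_1d(x, 6, align_corners=False))  # [ 0.  0. 10. 20. 20. 30.]
print(nearest_interp_1d(x, 6, align_corners=True))   # [ 0. 10. 10. 20. 20. 30.]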
@@ -77,6 +77,7 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
                      data_dims,
                      strides,
                      ksize);
+#if (CANN_VERSION_CODE < 512000)
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]),
         ksize[0],
@@ -91,7 +92,7 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1],
             std::max(paddings[2], paddings[3])));
-
+#endif
     if (adaptive) {
       std::string pooling_mode = "AdaptiveAvgPool2d";
       if (pooling_type == "max") {
@@ -228,7 +229,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
                      data_dims,
                      strides,
                      ksize);
-
+#if (CANN_VERSION_CODE < 512000)
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]),
         ksize[0],
@@ -243,7 +244,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1],
             std::max(paddings[2], paddings[3])));
-
+#endif
     if (adaptive || (global_pooling && pooling_type == "max")) {
       PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0],
                         0,
......
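Note: both pooling hunks wrap the same runtime check in #if (CANN_VERSION_CODE < 512000): older CANN pooling kernels require each padding to be strictly smaller than the kernel size along that dimension, while 5.1.2 and newer apparently lift the restriction. A standalone restatement of the guarded condition (function name and signature are ours, not Paddle's API):

def check_pool_paddings(paddings, ksize):
    # paddings = [pad_top, pad_bottom, pad_left, pad_right], ksize = [kh, kw];
    # mirrors the PADDLE_ENFORCE_LT conditions compiled only for CANN < 5.1.2.
    if not max(paddings[0], paddings[1]) < ksize[0]:
        raise ValueError(
            "Paddings should be less than %d, but max(pads[0], pads[1]) is %d."
            % (ksize[0], max(paddings[0], paddings[1])))
    if not max(paddings[2], paddings[3]) < ksize[1]:
        raise ValueError(
            "Paddings should be less than %d, but max(pads[2], pads[3]) is %d."
            % (ksize[1], max(paddings[2], paddings[3])))

check_pool_paddings([1, 1, 1, 1], [3, 3])   # passes
# check_pool_paddings([3, 0, 0, 0], [3, 3])  # would raise: 3 is not < 3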
@@ -130,9 +130,22 @@ class SliceNPUKernel : public framework::OpKernel<T> {
     UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);

-    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-    const auto& runner = NpuOpRunner(
-        "SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}});
+    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
+    auto stream = dev_ctx.stream();
+#if CANN_VERSION_CODE < 512000
+    const auto& runner = NpuOpRunner(
+        "SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}});
+#else
+    NpuOpRunner runner;
+    runner.SetType("Slice")
+        .AddInput(*input)
+        .AddInput(std::move(offsets))
+        .AddInput(std::move(size))
+        .AddOutput(*out);
+#endif
     runner.Run(stream);
   }
 };
......
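Note: on CANN >= 5.1.2 this kernel switches from SliceD, which takes offsets and size as compile-time attributes, to Slice, which takes them as runtime input tensors (hence the AddInput calls). The slicing semantics are the same either way: take size[i] elements starting at offsets[i] along axis i. A small numpy equivalent for reference (illustrative, not the NPU op):

import numpy as np

def slice_by_offsets_size(x, offsets, size):
    # Take size[i] elements starting at offsets[i] along each axis,
    # matching the offsets/size convention used by SliceD and Slice.
    idx = tuple(slice(o, o + s) for o, s in zip(offsets, size))
    return x[idx]

x = np.arange(24).reshape(2, 3, 4)
print(slice_by_offsets_size(x, offsets=[0, 1, 2], size=[2, 2, 2]).shape)  # (2, 2, 2)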
@@ -15,18 +15,121 @@
 import unittest
 import sys
 import paddle
-import paddle.fluid as fluid
-from paddle.fluid import core
-from paddle.fluid.contrib.mixed_precision import fp16_utils
+import paddle.nn as nn
+import paddle.static as static
 import numpy as np

 sys.path.append("..")
-import test_mixed_precision

 paddle.enable_static()


-class AMPTestNpu(test_mixed_precision.AMPTest):
+class SimpleNet(nn.Layer):
+
+    def __init__(self, input_size, output_size):
+        super(SimpleNet, self).__init__()
+        self.linear1 = nn.Linear(input_size, output_size)
+        self.relu1 = nn.ReLU()
+        self.linear2 = nn.Linear(input_size, output_size)
+        self.relu2 = nn.ReLU()
+        self.linear3 = nn.Linear(input_size, output_size)
+
+    def forward(self, x):
+        x = self.linear1(x)
+        # Currently, paddle's relu may hide nan/inf (relu(nan) = 0,
+        # relu(inf) = inf), so do not use it here.
+        # x = self.relu1(x)
+        x = self.linear2(x)
+        # x = self.relu2(x)
+        x = self.linear3(x)
+        return x
+
+
+class AMPTestNpu(unittest.TestCase):

     def setUp(self):
         self.place = paddle.NPUPlace(0)

+    def net(self):
+        input_size = 4096
+        output_size = 4096
+        x = static.data(name='X', shape=[1000, 4096], dtype='float32')
+        label = static.data(name='Y', shape=[1000, 4096], dtype='float32')
+        model = SimpleNet(input_size, output_size)  # define the model
+        mse = paddle.nn.MSELoss()
+        out = model(x)
+        loss = mse(out, label)
+        opt = paddle.fluid.optimizer.Adam(
+            learning_rate=0.0001,
+            parameter_list=model.parameters())  # define the optimizer
+        opt = paddle.static.amp.decorate(opt,
+                                         init_loss_scaling=128.0,
+                                         use_dynamic_loss_scaling=True)
+        opt.minimize(loss)
+        return model, loss, opt
+
+    def test_skip_update(self):
+        input_size = 4096
+        output_size = 4096
+        batch_size = 1000
+        nums_batch = 10
+
+        startup_prog = paddle.static.Program()
+        main_prog = paddle.static.Program()
+        with static.program_guard(main_prog, startup_prog):
+            model, loss, opt = self.net()
+            weight = model.linear1.weight
+            moment1 = opt._optimizer._get_accumulator(
+                opt._optimizer._moment1_acc_str, weight)
+            beta_pow1 = opt._optimizer._get_accumulator(
+                opt._optimizer._beta1_pow_acc_str, weight)
+            fetch_list = [
+                loss, weight, moment1, beta_pow1, 'find_infinite_scale.tmp_0'
+            ]
+
+            exe = paddle.static.Executor(self.place)
+            train_data = [
+                np.random.rand(batch_size, input_size).astype(np.float32)
+                for _ in range(nums_batch)
+            ]
+            labels = [
+                np.random.rand(batch_size, output_size).astype(np.float32)
+                for _ in range(nums_batch)
+            ]
+
+            weight_, moment1_, beta_pow1_ = exe.run(
+                startup_prog, fetch_list=[weight, moment1, beta_pow1])
+            pre_weight_, pre_moment1_, pre_beta_pow1_ = weight_, moment1_, beta_pow1_
+            for i in range(nums_batch):
+                if i % 2:
+                    train_data[i][10] = np.inf
+                loss_, weight_, moment1_, beta_pow1_, found_inf = exe.run(
+                    main_prog,
+                    feed={
+                        "X": train_data[i],
+                        "Y": labels[i]
+                    },
+                    fetch_list=fetch_list)
+                print(loss_, weight_[0][0], moment1_[0][0], beta_pow1_,
+                      found_inf)
+                if i % 2:
+                    self.assertTrue(found_inf)
+                    np.testing.assert_array_equal(weight_, pre_weight_)
+                    np.testing.assert_array_equal(moment1_, pre_moment1_)
+                    np.testing.assert_array_equal(beta_pow1_, pre_beta_pow1_)
+                else:
+                    self.assertFalse(found_inf)
+                    self.assertFalse(np.array_equal(weight_, pre_weight_))
+                    self.assertFalse(np.array_equal(moment1_, pre_moment1_))
+                    self.assertFalse(np.array_equal(beta_pow1_, pre_beta_pow1_))
+                pre_weight_, pre_moment1_, pre_beta_pow1_ = weight_, moment1_, beta_pow1_


 if __name__ == '__main__':
     unittest.main()
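For reference, the behavior test_skip_update asserts is the standard dynamic loss-scaling rule: when any gradient is non-finite, the optimizer update (weights, Adam moments, beta_pow accumulators) must be skipped and the loss scaling reduced. A simplified numpy sketch of that rule (our simplification; Paddle's dynamic loss scaling additionally waits for a window of clean steps before growing the scale):

import numpy as np

def amp_update(params, grads, loss_scaling, lr=1e-4,
               incr_ratio=2.0, decr_ratio=0.5):
    # Skip-update rule: non-finite gradients leave parameters and
    # optimizer state untouched and shrink the loss scaling instead.
    found_inf = any(not np.all(np.isfinite(g)) for g in grads)
    if found_inf:
        return params, loss_scaling * decr_ratio, True
    new_params = [p - lr * (g / loss_scaling) for p, g in zip(params, grads)]
    return new_params, loss_scaling * incr_ratio, False

params = [np.ones(3)]
params, scale, inf = amp_update(params, [np.array([np.inf, 0., 0.])], 128.0)
print(inf, scale, params[0])  # True 64.0 [1. 1. 1.]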