From a119686ce108c4e1880182e423bf0a6f26db759a Mon Sep 17 00:00:00 2001
From: ronnywang
Date: Tue, 30 Aug 2022 20:49:53 +0800
Subject: [PATCH] [NPU] fix pool_op, interpolate_op (#45445)

* [NPU] fix pool_op, interpolate_op

* fix slice_op_npu

* fix test_mixed_precision_npu
---
 paddle/fluid/operators/interpolate_op_npu.cc  |   2 +
 paddle/fluid/operators/pool_op_npu.cc         |   7 +-
 paddle/fluid/operators/slice_op_npu.cc        |  19 +++-
 .../unittests/npu/test_mixed_precision_npu.py | 107 +++++++++++++++++-
 4 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc
index a1c1a41d956..3548506eea4 100644
--- a/paddle/fluid/operators/interpolate_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_op_npu.cc
@@ -25,12 +25,14 @@ using DataLayout = framework::DataLayout;
 
 inline static void CheckArgument(const framework::ExecutionContext& ctx) {
   const std::string interp_method = ctx.Attr<std::string>("interp_method");
+#if (CANN_VERSION_CODE < 512000)
   bool align_corners = ctx.Attr<bool>("align_corners");
   PADDLE_ENFORCE_EQ(
       align_corners,
       false,
       platform::errors::InvalidArgument(
           "NPU Interpolate Kernel has diff when align_corners is true."));
+#endif
   PADDLE_ENFORCE_EQ(
       interp_method,
       "nearest",
diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc
index 5821f5c8d78..7e9b0b65113 100644
--- a/paddle/fluid/operators/pool_op_npu.cc
+++ b/paddle/fluid/operators/pool_op_npu.cc
@@ -77,6 +77,7 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
                              data_dims,
                              strides,
                              ksize);
+#if (CANN_VERSION_CODE < 512000)
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]),
         ksize[0],
@@ -91,7 +92,7 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1],
             std::max(paddings[2], paddings[3])));
-
+#endif
     if (adaptive) {
       std::string pooling_mode = "AdaptiveAvgPool2d";
       if (pooling_type == "max") {
@@ -228,7 +229,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
                              data_dims,
                              strides,
                              ksize);
-
+#if (CANN_VERSION_CODE < 512000)
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]),
         ksize[0],
@@ -243,7 +244,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1],
             std::max(paddings[2], paddings[3])));
-
+#endif
     if (adaptive || (global_pooling && pooling_type == "max")) {
       PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0],
                         0,
diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc
index 85d4aacc0b6..9d248bfd7f3 100644
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ b/paddle/fluid/operators/slice_op_npu.cc
@@ -130,9 +130,22 @@ class SliceNPUKernel : public framework::OpKernel<T> {
 
     UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);
 
-    auto stream = ctx.template device_context<platform::NPUDeviceContext>().stream();
-    const auto& runner = NpuOpRunner(
-        "SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}});
+    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
+    auto stream = dev_ctx.stream();
+#if CANN_VERSION_CODE < 512000
+    const auto& runner =
+        NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, {
+                      "size",
+                      size
+                    }});
+#else
+    NpuOpRunner runner;
+    runner.SetType("Slice")
+        .AddInput(*input)
+        .AddInput(std::move(offsets))
+        .AddInput(std::move(size))
+        .AddOutput(*out);
+#endif
     runner.Run(stream);
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
index 26a74b7b736..9927316fddc 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
@@ -15,18 +15,121 @@
 import unittest
 import sys
 import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.contrib.mixed_precision import fp16_utils
+import paddle.nn as nn
+import paddle.static as static
+import numpy as np
 
 sys.path.append("..")
-import test_mixed_precision
 
 paddle.enable_static()
 
 
-class AMPTestNpu(test_mixed_precision.AMPTest):
+class SimpleNet(nn.Layer):
+
+    def __init__(self, input_size, output_size):
+        super(SimpleNet, self).__init__()
+        self.linear1 = nn.Linear(input_size, output_size)
+        self.relu1 = nn.ReLU()
+        self.linear2 = nn.Linear(input_size, output_size)
+        self.relu2 = nn.ReLU()
+        self.linear3 = nn.Linear(input_size, output_size)
+
+    def forward(self, x):
+
+        x = self.linear1(x)
+        # currently, paddle's relu may hide nan/inf: relu(nan) = 0, relu(inf) = inf,
+        # so do not use it here.
+        # x = self.relu1(x)
+        x = self.linear2(x)
+        # x = self.relu2(x)
+        x = self.linear3(x)
+
+        return x
+
+
+class AMPTestNpu(unittest.TestCase):
 
     def setUp(self):
         self.place = paddle.NPUPlace(0)
 
+    def net(self):
+        input_size = 4096
+        output_size = 4096
+        x = static.data(name='X', shape=[1000, 4096], dtype='float32')
+        label = static.data(name='Y', shape=[1000, 4096], dtype='float32')
+        model = SimpleNet(input_size, output_size)  # define the model
+        mse = paddle.nn.MSELoss()
+
+        out = model(x)
+        loss = mse(out, label)
+
+        opt = paddle.fluid.optimizer.Adam(
+            learning_rate=0.0001, parameter_list=model.parameters())  # define the optimizer
+        opt = paddle.static.amp.decorate(opt,
+                                         init_loss_scaling=128.0,
+                                         use_dynamic_loss_scaling=True)
+        opt.minimize(loss)
+        return model, loss, opt
+
+    def test_skip_update(self):
+        input_size = 4096
+        output_size = 4096
+        batch_size = 1000
+        nums_batch = 10
+        startup_prog = paddle.static.Program()
+        main_prog = paddle.static.Program()
+        with static.program_guard(main_prog, startup_prog):
+            model, loss, opt = self.net()
+            weight = model.linear1.weight
+            moment1 = opt._optimizer._get_accumulator(
+                opt._optimizer._moment1_acc_str, weight)
+            beta_pow1 = opt._optimizer._get_accumulator(
+                opt._optimizer._beta1_pow_acc_str, weight)
+            fetch_list = [
+                loss, weight, moment1, beta_pow1, 'find_infinite_scale.tmp_0'
+            ]
+
+            exe = paddle.static.Executor(self.place)
+
+            train_data = [
+                np.random.rand(batch_size, input_size).astype(np.float32)
+                for _ in range(nums_batch)
+            ]
+            labels = [
+                np.random.rand(batch_size, output_size).astype(np.float32)
+                for _ in range(nums_batch)
+            ]
+
+            weight_, moment1_, beta_pow1_ = exe.run(
+                startup_prog, fetch_list=[weight, moment1, beta_pow1])
+            pre_weight_, pre_moment1_, pre_beta_pow1_ = weight_, moment1_, beta_pow1_
+            for i in range(nums_batch):
+                if i % 2:
+                    train_data[i][10] = np.inf
+                loss_, weight_, moment1_, beta_pow1_, found_inf = exe.run(
+                    main_prog,
+                    feed={
+                        "X": train_data[i],
+                        "Y": labels[i]
+                    },
+                    fetch_list=fetch_list)
+                print(loss_, weight_[0][0], moment1_[0][0], beta_pow1_,
+                      found_inf)
+                if i % 2:
+                    self.assertTrue(found_inf)
+                    np.testing.assert_array_equal(weight_, pre_weight_)
+                    np.testing.assert_array_equal(moment1_, pre_moment1_)
+                    np.testing.assert_array_equal(beta_pow1_, pre_beta_pow1_)
+                else:
+                    self.assertFalse(found_inf)
+                    self.assertFalse(np.array_equal(weight_, pre_weight_))
+                    self.assertFalse(np.array_equal(moment1_, pre_moment1_))
+                    self.assertFalse(np.array_equal(beta_pow1_, pre_beta_pow1_))
+                pre_weight_, pre_moment1_, pre_beta_pow1_ = weight_, moment1_, beta_pow1_
+
 
 if __name__ == '__main__':
     unittest.main()
--
GitLab
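
Note: a quick way to exercise the Slice NPU kernel path touched by this patch is a small static-graph program. The snippet below is a minimal sketch and is not part of the patch; it assumes an NPU (CANN) build of Paddle where paddle.NPUPlace(0) is available, and it only relies on public APIs (paddle.slice, paddle.static).

    # Minimal sketch (not part of the patch): run paddle.slice on the NPU so the
    # Slice/SliceD kernel selected by the CANN_VERSION_CODE guard is exercised.
    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[3, 4], dtype='float32')
        # slice rows 1:3 and columns 0:2; the kernel converts these (starts, ends)
        # attributes into the offsets/size values passed to the NPU runner
        y = paddle.slice(x, axes=[0, 1], starts=[1, 0], ends=[3, 2])

    exe = paddle.static.Executor(paddle.NPUPlace(0))
    exe.run(startup_prog)
    x_np = np.arange(12).reshape(3, 4).astype('float32')
    out, = exe.run(main_prog, feed={'x': x_np}, fetch_list=[y])
    np.testing.assert_array_equal(out, x_np[1:3, 0:2])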