From 280d74219a7f3e2fd7d71ff1f9091bc8eec5081b Mon Sep 17 00:00:00 2001
From: Baibaifan <39549453+Baibaifan@users.noreply.github.com>
Date: Thu, 2 Sep 2021 10:35:59 +0800
Subject: [PATCH] [npu] add update_loss_scaling npu min value (#35270)

---
 .../amp/update_loss_scaling_op_npu.cc         | 23 ++++--
 paddle/fluid/platform/flags.cc                |  1 +
 .../pybind/global_value_getter_setter.cc      |  3 +
 python/paddle/fluid/__init__.py               |  1 +
 .../test_update_loss_scaling_min_op_npu.py    | 76 +++++++++++++++++++
 5 files changed, 96 insertions(+), 8 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py

diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
index 6db18c46a09..0046440429f 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
+DECLARE_int32(min_loss_scaling);
+
 namespace paddle {
 namespace operators {
 
@@ -49,7 +51,7 @@ void Update(const platform::NPUDeviceContext& ctx,
 
     std::vector<int> bad_out_data;
     TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
-    if (bad_out_data[0] == decr_every_n_nan_or_inf) {
+    if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
                                           {{"power", static_cast<float>(1)},
@@ -60,13 +62,18 @@
       std::vector<T> new_loss_scaling;
       TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
-      if (new_loss_scaling[0] < static_cast<float>(1)) {
+      float min_value = 1.0;
+      if (FLAGS_min_loss_scaling > 1) {
+        min_value = static_cast<float>(FLAGS_min_loss_scaling);
+      }
+
+      if (new_loss_scaling[0] < min_value) {
         // updated_loss_scaling_data = 1
-        const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                            {*updated_loss_scaling_tensor},
-                                            {{"power", static_cast<float>(1)},
-                                             {"scale", static_cast<float>(0)},
-                                             {"shift", static_cast<float>(1)}});
+        const auto& runner_p4 = NpuOpRunner(
+            "Power", {*pre_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
+            {{"power", static_cast<float>(1)},
+             {"scale", static_cast<float>(0)},
+             {"shift", static_cast<float>(min_value)}});
 
         runner_p4.Run(stream);
       }
 
@@ -93,7 +100,7 @@ void Update(const platform::NPUDeviceContext& ctx,
 
     std::vector<int> good_out_data;
     TensorToVector(*good_out_tensor, ctx, &good_out_data);
-    if (good_out_data[0] == incr_every_n_steps) {
+    if (good_out_data[0] >= incr_every_n_steps) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
                                           {{"power", static_cast<float>(1)},
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index f18eab32465..0274a2cea8e 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -100,6 +100,7 @@ DEFINE_string(
     npu_config_path, "",
     "The absolute path of configuration json file, like: /tmp/config.json. "
" "If proveided, it will be passed to aclInit()."); +DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 4824a34e843..dd45443a041 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -98,6 +98,8 @@ DECLARE_string(selected_xpus); #ifdef PADDLE_WITH_ASCEND_CL // device management DECLARE_string(selected_npus); +// set minmum loss scaling value +DECLARE_int32(min_loss_scaling); #endif #ifdef PADDLE_WITH_DISTRIBUTE @@ -385,6 +387,7 @@ static void RegisterGlobalVarGetterSetter() { #ifdef PADDLE_WITH_ASCEND_CL REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus); + REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling); #endif #ifdef PADDLE_WITH_DITRIBUTE diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8bb4d82b724..3fe7f90a5b3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -249,6 +249,7 @@ def __bootstrap__(): 'npu_config_path', 'get_host_by_name_time', 'hccl_check_nan', + 'min_loss_scaling', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py new file mode 100644 index 00000000000..18e2db7f6b1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import numpy as np
+import sys
+import os
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
+from test_update_loss_scaling_op_npu import TestUpdateLossScalingOpBad
+
+paddle.enable_static()
+SEED = 2021
+
+
+class TestUpdateLossScalingOpMinLossScalingBad(TestUpdateLossScalingOpBad):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "update_loss_scaling"
+        self.place = paddle.NPUPlace(0)
+
+        self.init()
+        fluid.core.globals()['FLAGS_min_loss_scaling'] = 1639
+        found_inf = np.array([True], dtype=np.bool)
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        x[i[0]][j[0]] = np.inf
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': np.array([1639.0]).astype(self.dtype),
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.float32
+        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
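Usage note: a minimal sketch, outside the patch itself, of how the new
FLAGS_min_loss_scaling flag can be driven from Python on a PADDLE_WITH_ASCEND_CL
build. Both mechanisms shown are introduced by this patch: the flag is appended
to the bootstrap tryfromenv list, so it can be read from the environment, and it
is exported through the pybind global getter/setter, which is how the new unit
test sets it. The threshold 1639 is only an illustrative value, and on a build
without Ascend-CL the flag is not registered at all.

    import os

    # Option 1: gflags in the tryfromenv list are read from the environment
    # when paddle bootstraps, so set the variable before importing paddle.
    os.environ['FLAGS_min_loss_scaling'] = '1639'

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    # Option 2: flip the flag at runtime through the registered global
    # getter/setter (the same mechanism the new unit test uses).
    fluid.core.globals()['FLAGS_min_loss_scaling'] = 1639

    # With the flag set above 1, the NPU update_loss_scaling kernel clamps a
    # shrinking loss scaling at this minimum instead of at 1.
    print(fluid.core.globals()['FLAGS_min_loss_scaling'])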