test_tensor_checker.py 4.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle


class TestTensorChecker(unittest.TestCase):
21
    def _parse_num_nan_inf(self, e):
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
        num_nan = 0
        num_inf = 0
        # Cannot catch the log in CUDA kernel.
        err_str_list = (
            str(e)
            .replace("(", " ")
            .replace(")", " ")
            .replace(",", " ")
            .split(" ")
        )
        for err_str in err_str_list:
            if "num_nan" in err_str:
                num_nan = int(err_str.split("=")[1])
            elif "num_inf" in err_str:
                num_inf = int(err_str.split("=")[1])
37
        return num_nan, num_inf
38

39
    def _generate_num_inf(self, place):
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
        num_inf = 0
        num_nan = 0
        paddle.set_device(place)
        # check op list
        x = paddle.to_tensor(
            [1, 0, 0],
            dtype='float32',
            stop_gradient=False,
        )
        y = paddle.to_tensor([0, 0, 1], dtype='float32')
        try:
            res = paddle.pow(x, y)
            # test backward
            paddle.autograd.backward([res])
            res = paddle.divide(y, x)
        except Exception as e:
56 57
            num_nan, num_inf = self._parse_num_nan_inf(e)
        return num_nan, num_inf
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83

    def test_tensor_checker(self):
        def _assert_flag(value):
            flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level']
            res = paddle.get_flags(flags)
            assert res["FLAGS_check_nan_inf"] == value

        paddle.set_flags({"FLAGS_check_nan_inf": 0})
        paddle.seed(102)
        checker_config = paddle.amp.debugging.TensorCheckerConfig(
            enable=True,
            debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT,
            checked_op_list=["elementwise_pow_grad"],
            skipped_op_list=["elementwise_div"],
            debug_step=[0, 3],
        )
        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        # check seed
        self.assertEqual(checker_config.initial_seed, 102)
        self.assertEqual(checker_config.seed, 102)
        _assert_flag(False)

        for place in places:
            paddle.amp.debugging.TensorCheckerConfig.current_step_id = 0
84
            for iter_id in range(5):
85
                paddle.amp.debugging.enable_tensor_checker(checker_config)
86
                if iter_id <= 2:
87 88
                    _assert_flag(True)
                    self.assertEqual(
89
                        iter_id + 1,
90 91
                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
                    )
92 93 94 95 96 97 98 99 100
                    num_nan, num_inf = self._generate_num_inf(place)
                    print(
                        f"-- [iter_id={iter_id}, place={place}] num_nan={num_nan}, num_inf={num_inf}"
                    )
                    self.assertEqual(
                        1,
                        num_nan,
                        f"Expected num_nan to be 1, but recieved {num_nan}, place={place}.",
                    )
101 102 103 104 105 106
                else:
                    self.assertEqual(
                        3,
                        paddle.amp.debugging.TensorCheckerConfig.current_step_id,
                    )
                    _assert_flag(False)
107 108 109 110 111 112 113 114 115
                    num_nan, num_inf = self._generate_num_inf(place)
                    print(
                        f"-- [iter_id={iter_id}, place={place}] num_nan={num_nan}, num_inf={num_inf}"
                    )
                    self.assertEqual(
                        0,
                        num_nan,
                        f"Expected num_nan to be 1, but recieved {num_nan}, place={place}.",
                    )
116 117 118 119 120 121 122

                paddle.amp.debugging.disable_tensor_checker()
                _assert_flag(False)


if __name__ == '__main__':
    unittest.main()