# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import unittest

import numpy as np

import paddle


class TestNanInfDirCheckResult(unittest.TestCase):
    def generate_inputs(self, shape, dtype="float32"):
        data = np.random.random(size=shape).astype(dtype)
        # [-10, 10)
        x = (data * 20 - 10) * np.random.randint(
            low=0, high=2, size=shape
        ).astype(dtype)
        y = np.random.randint(low=0, high=2, size=shape).astype(dtype)
        return x, y

    def get_reference_num_nan_inf(self, x):
        out = np.log(x)
        num_nan = np.sum(np.isnan(out))
        num_inf = np.sum(np.isinf(out))
38
        print(f"[reference] num_nan={num_nan}, num_inf={num_inf}")
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
        return num_nan, num_inf

    def get_num_nan_inf(
        self, x_np, use_cuda=True, add_assert=False, pt="nan_inf_log_dir"
    ):
        num_nan = 0
        num_inf = 0
        if add_assert:
            if use_cuda:
                paddle.device.set_device("gpu:0")
            else:
                paddle.device.set_device("cpu")
            x = paddle.to_tensor(x_np)
            out = paddle.log(x)
            sys.stdout.flush()
            if not use_cuda:
                os.path.exists(pt)
                num_nan = 0
                num_inf = 0
                for root, dirs, files in os.walk(pt):
                    for file_name in files:
                        if file_name.startswith('worker_cpu'):
                            file_path = os.path.join(root, file_name)
                            with open(file_path, "rb") as fp:
                                for e in fp:
                                    err_str_list = (
                                        str(e)
                                        .replace("(", " ")
                                        .replace(")", " ")
                                        .replace(",", " ")
                                        .split(" ")
                                    )
                                    for err_str in err_str_list:
                                        if "num_nan" in err_str:
                                            num_nan = int(err_str.split("=")[1])
                                        elif "num_inf" in err_str:
                                            num_inf = int(err_str.split("=")[1])
76
                print(f"[paddle] num_nan={num_nan}, num_inf={num_inf}")
77 78 79 80
        return num_nan, num_inf

    def test_num_nan_inf(self):
        path = "nan_inf_log_dir"
81 82 83 84 85 86 87 88

        checker_config = paddle.amp.debugging.TensorCheckerConfig(
            enable=True,
            debug_mode=paddle.amp.debugging.DebugMode.CHECK_ALL,
            output_dir=path,
        )

        paddle.amp.debugging.enable_tensor_checker(checker_config)
89 90 91 92 93 94 95

        def _check_num_nan_inf(use_cuda):
            shape = [32, 32]
            x_np, _ = self.generate_inputs(shape)
            num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np)
            add_assert = (num_nan_np + num_inf_np) > 0
            num_nan, num_inf = self.get_num_nan_inf(
96 97 98 99
                x_np,
                use_cuda,
                add_assert,
                path,
100 101 102 103 104 105
            )
            if not use_cuda:
                assert num_nan == num_nan_np and num_inf == num_inf_np

        if paddle.fluid.core.is_compiled_with_cuda():
            _check_num_nan_inf(use_cuda=True)
106 107 108
        else:
            _check_num_nan_inf(use_cuda=False)

109 110 111
        x = paddle.to_tensor([2, 3, 4], 'float32')
        y = paddle.to_tensor([1, 5, 2], 'float32')
        z = paddle.add(x, y)
112 113
        path = ""
        paddle.fluid.core.set_nan_inf_debug_path(path)
114
        paddle.amp.debugging.disable_tensor_checker()
115 116 117 118


# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()