From be7fcc0bfc4634c166d349d35ccd9e06f7882e2c Mon Sep 17 00:00:00 2001 From: helinwang Date: Wed, 7 Feb 2018 12:16:27 -0800 Subject: [PATCH] long running training tests: fail when got NaN loss (#8169) --- python/paddle/v2/fluid/tests/book/test_fit_a_line.py | 4 ++++ .../v2/fluid/tests/book/test_image_classification_train.py | 4 ++++ python/paddle/v2/fluid/tests/book/test_recognize_digits.py | 4 ++++ python/paddle/v2/fluid/tests/book/test_recommender_system.py | 4 ++++ .../paddle/v2/fluid/tests/book/test_understand_sentiment.py | 4 ++++ python/paddle/v2/fluid/tests/book/test_word2vec.py | 5 +++++ 6 files changed, 25 insertions(+) diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 27f34b1733..06860a2a46 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -16,6 +16,8 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import contextlib import unittest +import math +import sys def main(use_cuda): @@ -58,6 +60,8 @@ def main(use_cuda): print(avg_loss_value) if avg_loss_value[0] < 10.0: return + if math.isnan(float(avg_loss_value)): + sys.exit("got NaN loss, training failed.") raise AssertionError("Fit a line cost is too large, {0:2.2}".format( avg_loss_value[0])) diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index 03b009ebb0..ffbe5bdbd6 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -17,6 +17,8 @@ from __future__ import print_function import paddle.v2 as paddle import paddle.v2.fluid as fluid import contextlib +import math +import sys import numpy import unittest @@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname): loss_t, acc_t = exe.run(program=test_program, feed=feeder.feed(test_data), fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") acc_list.append(float(acc_t)) avg_loss_list.append(float(loss_t)) break # Use 1 segment for speeding up CI diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py index fb6b1f7192..c3f6877575 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -18,6 +18,8 @@ import paddle.v2 as paddle import sys import numpy import unittest +import math +import sys def parse_arg(): @@ -148,6 +150,8 @@ def train(nn_type, use_cuda, parallel, save_dirname): 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. format(pass_id, batch_id + 1, float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + sys.exit("got NaN loss, training failed.") raise AssertionError("Loss of recognize digits is too large") diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index d4a694e572..9c7ab7d631 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math +import sys import numpy as np import paddle.v2 as paddle import paddle.v2.fluid.core as core @@ -217,6 +219,8 @@ def main(): if out[0] < 6.0: # if avg cost less than 6.0, we think our code is good. exit(0) + if math.isnan(float(out[0])): + sys.exit("got NaN loss, training failed.") main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py index 2ba9077a26..9c5cb667ae 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py @@ -16,6 +16,8 @@ import unittest import paddle.v2.fluid as fluid import paddle.v2 as paddle import contextlib +import math +import sys def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, @@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda): print("cost=" + str(cost_val) + " acc=" + str(acc_val)) if cost_val < 0.4 and acc_val > 0.8: return + if math.isnan(float(cost_val)): + sys.exit("got NaN loss, training failed.") raise AssertionError("Cost is too large for {0}".format( net_method.__name__)) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 766ba9681d..f013d7f155 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -16,6 +16,8 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import unittest import os +import math +import sys def main(use_cuda, is_sparse, parallel): @@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel): fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: return + if math.isnan(float(avg_cost_np[0])): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) -- GitLab