提交 be7fcc0b 编写于 作者: H helinwang 提交者: Abhinav Arora

Long-running training tests: fail when the loss is NaN (#8169)

上级 8e5bc804
...@@ -16,6 +16,8 @@ import paddle.v2 as paddle ...@@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib import contextlib
import unittest import unittest
import math
import sys
def main(use_cuda): def main(use_cuda):
...@@ -58,6 +60,8 @@ def main(use_cuda): ...@@ -58,6 +60,8 @@ def main(use_cuda):
print(avg_loss_value) print(avg_loss_value)
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
return return
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Fit a line cost is too large, {0:2.2}".format( raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
avg_loss_value[0])) avg_loss_value[0]))
......
...@@ -17,6 +17,8 @@ from __future__ import print_function ...@@ -17,6 +17,8 @@ from __future__ import print_function
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import contextlib import contextlib
import math
import sys
import numpy import numpy
import unittest import unittest
...@@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname): ...@@ -145,6 +147,8 @@ def train(net_type, use_cuda, save_dirname):
loss_t, acc_t = exe.run(program=test_program, loss_t, acc_t = exe.run(program=test_program,
feed=feeder.feed(test_data), feed=feeder.feed(test_data),
fetch_list=[avg_cost, acc]) fetch_list=[avg_cost, acc])
if math.isnan(float(loss_t)):
sys.exit("got NaN loss, training failed.")
acc_list.append(float(acc_t)) acc_list.append(float(acc_t))
avg_loss_list.append(float(loss_t)) avg_loss_list.append(float(loss_t))
break # Use 1 segment for speeding up CI break # Use 1 segment for speeding up CI
......
...@@ -18,6 +18,8 @@ import paddle.v2 as paddle ...@@ -18,6 +18,8 @@ import paddle.v2 as paddle
import sys import sys
import numpy import numpy
import unittest import unittest
import math
import sys
def parse_arg(): def parse_arg():
...@@ -148,6 +150,8 @@ def train(nn_type, use_cuda, parallel, save_dirname): ...@@ -148,6 +150,8 @@ def train(nn_type, use_cuda, parallel, save_dirname):
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1, format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val))) float(avg_loss_val), float(acc_val)))
if math.isnan(float(avg_loss_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Loss of recognize digits is too large") raise AssertionError("Loss of recognize digits is too large")
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import math
import sys
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
...@@ -217,6 +219,8 @@ def main(): ...@@ -217,6 +219,8 @@ def main():
if out[0] < 6.0: if out[0] < 6.0:
# if avg cost less than 6.0, we think our code is good. # if avg cost less than 6.0, we think our code is good.
exit(0) exit(0)
if math.isnan(float(out[0])):
sys.exit("got NaN loss, training failed.")
main() main()
...@@ -16,6 +16,8 @@ import unittest ...@@ -16,6 +16,8 @@ import unittest
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
import contextlib import contextlib
import math
import sys
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
...@@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda): ...@@ -115,6 +117,8 @@ def main(word_dict, net_method, use_cuda):
print("cost=" + str(cost_val) + " acc=" + str(acc_val)) print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if cost_val < 0.4 and acc_val > 0.8: if cost_val < 0.4 and acc_val > 0.8:
return return
if math.isnan(float(cost_val)):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large for {0}".format( raise AssertionError("Cost is too large for {0}".format(
net_method.__name__)) net_method.__name__))
......
...@@ -16,6 +16,8 @@ import paddle.v2 as paddle ...@@ -16,6 +16,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import unittest import unittest
import os import os
import math
import sys
def main(use_cuda, is_sparse, parallel): def main(use_cuda, is_sparse, parallel):
...@@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel): ...@@ -112,6 +114,9 @@ def main(use_cuda, is_sparse, parallel):
fetch_list=[avg_cost]) fetch_list=[avg_cost])
if avg_cost_np[0] < 5.0: if avg_cost_np[0] < 5.0:
return return
if math.isnan(float(avg_cost_np[0])):
sys.exit("got NaN loss, training failed.")
raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册