diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c4deef6f579a64c911b734699d4e9b221fa578a6..fd7fc16bff5651f022b484623243048fbd225b5a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -264,6 +264,7 @@ function(cc_test TARGET_NAME)
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
       set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+      set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -328,6 +329,7 @@ function(nv_test TARGET_NAME)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
       set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+      set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -575,7 +577,7 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -20,6 +20,12 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
+DEFINE_bool(init_allocated_mem, false,
+            "It is a mistake that the values of the memory allocated by "
+            "BuddyAllocator are always zeroed in some op's implementation. "
+            "To find this error in time, we use init_allocated_mem to indicate "
+            "that initializing the allocated memory with a small value "
+            "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -41,6 +47,9 @@ template <>
 void* Alloc(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem && p != nullptr) {
+    memset(p, 0xEF, size);  // poison the block; skipped if allocation failed
+  }
   VLOG(10) << " pointer=" << p;
   return p;
 }
@@ -104,6 +113,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) {
     LOG(WARNING) << "GPU memory used: " << Used(place);
     platform::SetDeviceId(cur_dev);
   }
+  if (FLAGS_init_allocated_mem && ptr != nullptr) {
+    cudaMemset(ptr, 0xEF, size);  // poison; skipped if allocation failed
+  }
   return ptr;
 }
 
@@ -137,6 +149,9 @@ void* Alloc(platform::CUDAPinnedPlace place,
     LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
                  << " bytes in CUDAPinnedPlace";
   }
+  if (FLAGS_init_allocated_mem && ptr != nullptr) {
+    memset(ptr, 0xEF, size);  // poison; skipped if allocation failed
+  }
   return ptr;
 }
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -118,7 +118,8 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 21f2037ad408b0a92718c0ea2bae5e8bf563c665..cddf00765f4894126988c794763c34629449e8e6 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -18,6 +18,8 @@ import unittest
 import paddle.fluid as fluid
 import time
 import numpy as np
+import math
+import sys
 
 __all__ = ['TestParallelExecutorBase']
 
@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
             print "%.4f Instance per second" % (
                 (batch_size * iter + 2) / (end - begin))
 
+        avg_last_loss_val = np.array(last_loss).mean()
+        avg_first_loss_val = np.array(first_loss).mean()
+        if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                float(avg_first_loss_val)):
+            sys.exit("got NaN loss, training failed.")
+
         print first_loss, last_loss
         # self.assertGreater(first_loss[0], last_loss[0])
         return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 252793944462244539084a288e5259f216359650..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -16,6 +16,8 @@ import paddle.fluid as fluid
 import numpy as np
 import unittest
 import os
+import sys
+import math
 
 
 def simple_fc_net():
@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
 
                 train_loss, = train_exe.run([loss.name], feed=feed_dict)
 
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
+
                 self.assertTrue(
                     np.allclose(
                         train_loss, test_loss, atol=1e-8),