Init allocated memory for unit test (#11657)

* memory init * add env * refine announce * Add check for NaN * Debug * Add env for cc_test * Add env for py_test and nv_test * Remove py_test env * Add env for py_test * serial test_recognize_digits * Test FLAGS_init_allocated_mem function for unit test * Init allocated mem for op unit test * Add env for all unit test

Init allocated memory for unit test (#11657)
* memory init * add env * refine announce * Add check for NaN * Debug * Add env for cc_test * Add env for py_test and nv_test * Remove py_test env * Add env for py_test * serial test_recognize_digits * Test FLAGS_init_allocated_mem function for unit test * Init allocated mem for op unit test * Add env for all unit test
d2ad4a5c · chengduo · GitHub · 7b54f168 · d2ad4a5c · d2ad4a5c
5 changed files
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -264,6 +264,7 @@ function(cc_test TARGET_NAME)
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (${cc_test_SERIAL})
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(cc_test)
@@ -328,6 +329,7 @@ function(nv_test TARGET_NAME)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(nv_test)
@@ -575,7 +577,7 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()

--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -20,6 +20,12 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+DEFINE_bool(init_allocated_mem, false,  // off by default; the test targets set FLAGS_init_allocated_mem=true
+            "It is a mistake that the values of the memory allocated by "
+            "BuddyAllocator are always zeroed in some op's implementation. "
+            "To find this error in time, we use init_allocated_mem to indicate "
+            "that initializing the allocated memory with a small value "
+            "during unit testing.");  // when set, the Alloc specializations fill new memory with 0xEF bytes
 DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
@@ -41,6 +47,9 @@ template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem && p != nullptr) {  // skip a failed (null) allocation: memset(nullptr, ...) is UB
+    memset(p, 0xEF, size);  // non-zero fill so code that assumes zeroed memory misbehaves visibly in tests
+  }
   VLOG(10) << "  pointer=" << p;
   return p;
  }
@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
    platform::SetDeviceId(cur_dev);
  }
+  if (FLAGS_init_allocated_mem) {
+    cudaMemset(ptr, 0xEF, size);
+  }
  return ptr;
 }
@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
                 << " bytes in CUDAPinnedPlace";
  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
  return ptr;
 }

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -118,7 +118,8 @@ def __bootstrap__():
    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem'
    ]
    if core.is_compiled_with_cuda():
        read_env_flags += [

--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -18,6 +18,8 @@ import unittest
 import paddle.fluid as fluid
 import time
 import numpy as np
+import math
+import sys
 __all__ = ['TestParallelExecutorBase']
@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
                print "%.4f Instance per second" % (
                    (batch_size * iter + 2) / (end - begin))
+            avg_last_loss_val = np.array(last_loss).mean()
+            avg_first_loss_val = np.array(first_loss).mean()
+            if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                    float(avg_first_loss_val)):
+                sys.exit("got NaN loss, training failed.")
            print first_loss, last_loss
            # self.assertGreater(first_loss[0], last_loss[0])
            return first_loss, last_loss
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -16,6 +16,8 @@ import paddle.fluid as fluid
 import numpy as np
 import unittest
 import os
+import sys
+import math
 def simple_fc_net():
@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
                self.assertTrue(
                    np.allclose(
                        train_loss, test_loss, atol=1e-8),