未验证 提交 d2ad4a5c 编写于 作者: C chengduo 提交者: GitHub

Init allocated memory for unit test (#11657)

* memory init

* add env

* refine anounce

* Add check for Nan

* Debug

* Add env for cc_test

* Add env for py_test and nv_test

* Remove py_test env

* Add env for py_test

* serial test_recognize_digits

* Test FLAGS_init_allocated_mem function for unit test

* Init allocated mem for op unit test

* Add env for all unit test
上级 7b54f168
......@@ -264,6 +264,7 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(cc_test)
......@@ -328,6 +329,7 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(nv_test)
......@@ -575,7 +577,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......
......@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
......@@ -41,6 +47,9 @@ template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
......@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
}
......@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
}
......
......@@ -118,7 +118,8 @@ def __bootstrap__():
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem'
]
if core.is_compiled_with_cuda():
read_env_flags += [
......
......@@ -18,6 +18,8 @@ import unittest
import paddle.fluid as fluid
import time
import numpy as np
import math
import sys
__all__ = ['TestParallelExecutorBase']
......@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
......@@ -16,6 +16,8 @@ import paddle.fluid as fluid
import numpy as np
import unittest
import os
import sys
import math
def simple_fc_net():
......@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, = train_exe.run([loss.name], feed=feed_dict)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue(
np.allclose(
train_loss, test_loss, atol=1e-8),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册