未验证 提交 6c471ed0 编写于 作者: N niuliling123 提交者: GitHub

Count the number of 0 in the output Tensor (#50981)

上级 49752074
...@@ -174,15 +174,19 @@ __device__ T BlockReduce(T value) { ...@@ -174,15 +174,19 @@ __device__ T BlockReduce(T value) {
// Combines the per-thread NaN / Inf / zero counters through BlockReduce and
// stores the three resulting values at index `offset` of the output arrays.
//
// Parameters:
//   num_nan, num_inf, num_zero  counters accumulated by the calling thread.
//   offset                      index written in each of the output arrays.
//   num_nan_ptr, num_inf_ptr,
//   num_zero_ptr                destination arrays for the reduced counters.
//
// Only the thread with threadIdx.x == 0 performs the stores.
//
// NOTE(review): BlockReduce is defined outside this view; presumably it
// returns the block-wide sum of `value` -- confirm against its definition.
__device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan,
                                             const int64_t num_inf,
                                             const int64_t num_zero,
                                             int64_t offset,
                                             int64_t* num_nan_ptr,
                                             int64_t* num_inf_ptr,
                                             int64_t* num_zero_ptr) {
  // The reductions are issued before the threadIdx.x guard, so every thread
  // that enters this function takes part in each of them.
  int64_t block_num_nan = BlockReduce<int64_t, 2>(num_nan);
  int64_t block_num_inf = BlockReduce<int64_t, 2>(num_inf);
  int64_t block_num_zero = BlockReduce<int64_t, 2>(num_zero);

  // A single thread writes the reduced counters to the output arrays.
  if (threadIdx.x == 0) {
    num_nan_ptr[offset] = block_num_nan;
    num_inf_ptr[offset] = block_num_inf;
    num_zero_ptr[offset] = block_num_zero;
  }
}
...@@ -233,6 +237,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, ...@@ -233,6 +237,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
const int64_t numel, const int64_t numel,
int64_t* block_num_nan_ptr, int64_t* block_num_nan_ptr,
int64_t* block_num_inf_ptr, int64_t* block_num_inf_ptr,
int64_t* block_num_zero_ptr,
MT* tensor_block_max_ptr, MT* tensor_block_max_ptr,
MT* tensor_block_min_ptr, MT* tensor_block_min_ptr,
MT* tensor_block_mean_ptr) { MT* tensor_block_mean_ptr) {
...@@ -240,6 +245,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, ...@@ -240,6 +245,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
int64_t num_nan = 0; int64_t num_nan = 0;
int64_t num_inf = 0; int64_t num_inf = 0;
int64_t num_zero = 0;
MT max_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]); MT max_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
MT min_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]); MT min_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
...@@ -256,10 +262,18 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, ...@@ -256,10 +262,18 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
} else if (isinf(value)) { } else if (isinf(value)) {
num_inf += 1; num_inf += 1;
} }
if (value == static_cast<MT>(0)) {
num_zero += 1;
}
} }
BlockReduceNumNanInfAndWrite( BlockReduceNumNanInfAndWrite(num_nan,
num_nan, num_inf, blockIdx.x, block_num_nan_ptr, block_num_inf_ptr); num_inf,
num_zero,
blockIdx.x,
block_num_nan_ptr,
block_num_inf_ptr,
block_num_zero_ptr);
BlockReduceMaxMinAndWrite<MT>(max_value, BlockReduceMaxMinAndWrite<MT>(max_value,
min_value, min_value,
...@@ -273,6 +287,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, ...@@ -273,6 +287,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
template <typename T, typename MT> template <typename T, typename MT>
__global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
const int64_t* block_num_inf_ptr, const int64_t* block_num_inf_ptr,
const int64_t* block_num_zero_ptr,
const MT* tensor_block_max_ptr, const MT* tensor_block_max_ptr,
const MT* tensor_block_min_ptr, const MT* tensor_block_min_ptr,
const MT* tensor_block_mean_ptr, const MT* tensor_block_mean_ptr,
...@@ -283,11 +298,13 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, ...@@ -283,11 +298,13 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
if (blockIdx.x == 0 && threadIdx.x == 0) { if (blockIdx.x == 0 && threadIdx.x == 0) {
int64_t num_nan = 0; int64_t num_nan = 0;
int64_t num_inf = 0; int64_t num_inf = 0;
int64_t num_zero = 0;
// numel_max_min <= 128 // numel_max_min <= 128
for (int64_t i = 0; i < numel_max_min; ++i) { for (int64_t i = 0; i < numel_max_min; ++i) {
num_nan += block_num_nan_ptr[i]; num_nan += block_num_nan_ptr[i];
num_inf += block_num_inf_ptr[i]; num_inf += block_num_inf_ptr[i];
num_zero += block_num_zero_ptr[i];
} }
MT max_value = static_cast<MT>(0); MT max_value = static_cast<MT>(0);
...@@ -314,6 +331,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, ...@@ -314,6 +331,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
numel, numel,
num_nan, num_nan,
num_inf, num_inf,
num_zero,
max_value, max_value,
min_value, min_value,
mean_value, mean_value,
...@@ -451,11 +469,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply( ...@@ -451,11 +469,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
int64_t numel_max_min = blocks; int64_t numel_max_min = blocks;
phi::DenseTensor block_num_nan_inf; phi::DenseTensor block_num_nan_inf_zero;
block_num_nan_inf.Resize({static_cast<int64_t>(2 * numel_max_min)}); block_num_nan_inf_zero.Resize({static_cast<int64_t>(3 * numel_max_min)});
int64_t* block_num_nan_ptr = int64_t* block_num_nan_ptr =
dev_ctx->template Alloc<int64_t>(&block_num_nan_inf); dev_ctx->template Alloc<int64_t>(&block_num_nan_inf_zero);
int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min; int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min;
int64_t* block_num_zero_ptr = block_num_inf_ptr + numel_max_min;
phi::DenseTensor tensor_block_max_min; phi::DenseTensor tensor_block_max_min;
tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)}); tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
...@@ -468,6 +487,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply( ...@@ -468,6 +487,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
tensor.numel(), tensor.numel(),
block_num_nan_ptr, block_num_nan_ptr,
block_num_inf_ptr, block_num_inf_ptr,
block_num_zero_ptr,
tensor_block_max_ptr, tensor_block_max_ptr,
tensor_block_min_ptr, tensor_block_min_ptr,
tensor_block_mean_ptr); tensor_block_mean_ptr);
...@@ -476,6 +496,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply( ...@@ -476,6 +496,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
FindGlobalMaxMinAndPrint<T, MT> FindGlobalMaxMinAndPrint<T, MT>
<<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr, <<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr,
block_num_inf_ptr, block_num_inf_ptr,
block_num_zero_ptr,
tensor_block_max_ptr, tensor_block_max_ptr,
tensor_block_min_ptr, tensor_block_min_ptr,
tensor_block_mean_ptr, tensor_block_mean_ptr,
......
...@@ -69,6 +69,7 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, ...@@ -69,6 +69,7 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
int64_t numel, int64_t numel,
int64_t num_nan, int64_t num_nan,
int64_t num_inf, int64_t num_inf,
int64_t num_zero,
MT max_value, MT max_value,
MT min_value, MT min_value,
MT mean_value, MT mean_value,
...@@ -76,26 +77,31 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, ...@@ -76,26 +77,31 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
if (num_nan > 0 || num_inf > 0) { if (num_nan > 0 || num_inf > 0) {
printf( printf(
"[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, " "[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, "
"num_inf=%lld, max=%e, min=%e, mean=%e\n", "num_inf=%lld, num_zero=%lld, max=%e, min=%e, mean=%e\n",
debug_info, debug_info,
static_cast<long long>(numel), // NOLINT static_cast<long long>(numel), // NOLINT
static_cast<long long>(num_nan), // NOLINT static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
static_cast<float>(max_value), static_cast<float>(max_value),
static_cast<float>(min_value), static_cast<float>(min_value),
static_cast<float>(mean_value)); static_cast<float>(mean_value));
if (check_nan_inf_level == 0) { if (check_nan_inf_level == 0) {
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
PADDLE_ENFORCE(false, PADDLE_ENFORCE(false,
"There are NAN or INF (num_nan=%ld, num_inf=%lld) in %s.", "There are NAN or INF (num_nan=%ld, num_inf=%lld, "
static_cast<long long>(num_nan), // NOLINT "num_zero=%lld) in %s.",
static_cast<long long>(num_inf), // NOLINT static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
debug_info); debug_info);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are NAN or INF (num_nan=%lld, num_inf=%lld) in %s.", "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in "
static_cast<long long>(num_nan), // NOLINT "%s.",
static_cast<long long>(num_inf), // NOLINT static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
debug_info)); debug_info));
#endif #endif
} }
...@@ -114,6 +120,7 @@ void PrintForDifferentLevelFile(const char* debug_info, ...@@ -114,6 +120,7 @@ void PrintForDifferentLevelFile(const char* debug_info,
int64_t numel, int64_t numel,
int64_t num_nan, int64_t num_nan,
int64_t num_inf, int64_t num_inf,
int64_t num_zero,
MT max_value, MT max_value,
MT min_value, MT min_value,
MT mean_value, MT mean_value,
...@@ -136,9 +143,10 @@ void PrintForDifferentLevelFile(const char* debug_info, ...@@ -136,9 +143,10 @@ void PrintForDifferentLevelFile(const char* debug_info,
if (num_nan > 0 || num_inf > 0) { if (num_nan > 0 || num_inf > 0) {
outfile << "[PRECISION] [ERROR] in " << debug_info outfile << "[PRECISION] [ERROR] in " << debug_info
<< ", numel=" << static_cast<long long>(numel) // NOLINT << ", numel=" << static_cast<long long>(numel) // NOLINT
<< ", num_nan=" << static_cast<long long>(num_nan) // NOLINT << ", num_nan=" << static_cast<long long>(num_nan) // NOLINT
<< ", num_inf=" << static_cast<long long>(num_inf) // NOLINT << ", num_inf=" << static_cast<long long>(num_inf) // NOLINT
<< ", num_zero=" << static_cast<long long>(num_zero) // NOLINT
<< ", max=" << static_cast<float>(max_value) << ", max=" << static_cast<float>(max_value)
<< ", min=" << static_cast<float>(min_value) << ", min=" << static_cast<float>(min_value)
<< ", mean=" << static_cast<float>(mean_value) << std::endl; << ", mean=" << static_cast<float>(mean_value) << std::endl;
...@@ -200,6 +208,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr, ...@@ -200,6 +208,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
std::vector<int64_t> thread_num_nan(num_threads, 0); std::vector<int64_t> thread_num_nan(num_threads, 0);
std::vector<int64_t> thread_num_inf(num_threads, 0); std::vector<int64_t> thread_num_inf(num_threads, 0);
std::vector<int64_t> thread_num_zero(num_threads, 0);
std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0])); std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0])); std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0)); std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));
...@@ -230,17 +239,22 @@ static void CheckNanInfCpuImpl(const T* value_ptr, ...@@ -230,17 +239,22 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
} else if (std::isinf(value)) { } else if (std::isinf(value)) {
thread_num_inf[tid] += 1; thread_num_inf[tid] += 1;
} }
if (value == 0) {
thread_num_zero[tid] += 1;
}
} }
} }
int64_t num_nan = 0; int64_t num_nan = 0;
int64_t num_inf = 0; int64_t num_inf = 0;
int64_t num_zero = 0;
MT min_value = thread_min_value[0]; MT min_value = thread_min_value[0];
MT max_value = thread_max_value[0]; MT max_value = thread_max_value[0];
MT mean_value = static_cast<MT>(0); MT mean_value = static_cast<MT>(0);
for (int i = 0; i < num_threads; ++i) { for (int i = 0; i < num_threads; ++i) {
num_nan += thread_num_nan[i]; num_nan += thread_num_nan[i];
num_inf += thread_num_inf[i]; num_inf += thread_num_inf[i];
num_zero += thread_num_zero[i];
min_value = std::min(thread_min_value[i], min_value); min_value = std::min(thread_min_value[i], min_value);
max_value = std::max(thread_max_value[i], max_value); max_value = std::max(thread_max_value[i], max_value);
mean_value += thread_mean_value[i]; mean_value += thread_mean_value[i];
...@@ -254,6 +268,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr, ...@@ -254,6 +268,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
numel, numel,
num_nan, num_nan,
num_inf, num_inf,
num_zero,
max_value, max_value,
min_value, min_value,
mean_value, mean_value,
...@@ -266,6 +281,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr, ...@@ -266,6 +281,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
numel, numel,
num_nan, num_nan,
num_inf, num_inf,
num_zero,
max_value, max_value,
min_value, min_value,
mean_value, mean_value,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册