未验证 提交 6c471ed0 编写于 作者: N niuliling123 提交者: GitHub

Count the number of 0 in the output Tensor (#50981)

上级 49752074
......@@ -174,15 +174,19 @@ __device__ T BlockReduce(T value) {
__device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan,
const int64_t num_inf,
const int64_t num_zero,
int64_t offset,
int64_t* num_nan_ptr,
int64_t* num_inf_ptr) {
int64_t* num_inf_ptr,
int64_t* num_zero_ptr) {
int64_t block_num_nan = BlockReduce<int64_t, 2>(num_nan);
int64_t block_num_inf = BlockReduce<int64_t, 2>(num_inf);
int64_t block_num_zero = BlockReduce<int64_t, 2>(num_zero);
if (threadIdx.x == 0) {
num_nan_ptr[offset] = block_num_nan;
num_inf_ptr[offset] = block_num_inf;
num_zero_ptr[offset] = block_num_zero;
}
}
......@@ -233,6 +237,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
const int64_t numel,
int64_t* block_num_nan_ptr,
int64_t* block_num_inf_ptr,
int64_t* block_num_zero_ptr,
MT* tensor_block_max_ptr,
MT* tensor_block_min_ptr,
MT* tensor_block_mean_ptr) {
......@@ -240,6 +245,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
int64_t num_nan = 0;
int64_t num_inf = 0;
int64_t num_zero = 0;
MT max_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
MT min_value = static_cast<MT>(i < numel ? value_ptr[i] : value_ptr[0]);
......@@ -256,10 +262,18 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
} else if (isinf(value)) {
num_inf += 1;
}
if (value == static_cast<MT>(0)) {
num_zero += 1;
}
}
BlockReduceNumNanInfAndWrite(
num_nan, num_inf, blockIdx.x, block_num_nan_ptr, block_num_inf_ptr);
BlockReduceNumNanInfAndWrite(num_nan,
num_inf,
num_zero,
blockIdx.x,
block_num_nan_ptr,
block_num_inf_ptr,
block_num_zero_ptr);
BlockReduceMaxMinAndWrite<MT>(max_value,
min_value,
......@@ -273,6 +287,7 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr,
template <typename T, typename MT>
__global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
const int64_t* block_num_inf_ptr,
const int64_t* block_num_zero_ptr,
const MT* tensor_block_max_ptr,
const MT* tensor_block_min_ptr,
const MT* tensor_block_mean_ptr,
......@@ -283,11 +298,13 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
if (blockIdx.x == 0 && threadIdx.x == 0) {
int64_t num_nan = 0;
int64_t num_inf = 0;
int64_t num_zero = 0;
// numel_max_min <= 128
for (int64_t i = 0; i < numel_max_min; ++i) {
num_nan += block_num_nan_ptr[i];
num_inf += block_num_inf_ptr[i];
num_zero += block_num_zero_ptr[i];
}
MT max_value = static_cast<MT>(0);
......@@ -314,6 +331,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
numel,
num_nan,
num_inf,
num_zero,
max_value,
min_value,
mean_value,
......@@ -451,11 +469,12 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
int64_t numel_max_min = blocks;
phi::DenseTensor block_num_nan_inf;
block_num_nan_inf.Resize({static_cast<int64_t>(2 * numel_max_min)});
phi::DenseTensor block_num_nan_inf_zero;
block_num_nan_inf_zero.Resize({static_cast<int64_t>(3 * numel_max_min)});
int64_t* block_num_nan_ptr =
dev_ctx->template Alloc<int64_t>(&block_num_nan_inf);
dev_ctx->template Alloc<int64_t>(&block_num_nan_inf_zero);
int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min;
int64_t* block_num_zero_ptr = block_num_inf_ptr + numel_max_min;
phi::DenseTensor tensor_block_max_min;
tensor_block_max_min.Resize({static_cast<int64_t>(3 * numel_max_min)});
......@@ -468,6 +487,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
tensor.numel(),
block_num_nan_ptr,
block_num_inf_ptr,
block_num_zero_ptr,
tensor_block_max_ptr,
tensor_block_min_ptr,
tensor_block_mean_ptr);
......@@ -476,6 +496,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
FindGlobalMaxMinAndPrint<T, MT>
<<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr,
block_num_inf_ptr,
block_num_zero_ptr,
tensor_block_max_ptr,
tensor_block_min_ptr,
tensor_block_mean_ptr,
......
......@@ -69,6 +69,7 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
int64_t numel,
int64_t num_nan,
int64_t num_inf,
int64_t num_zero,
MT max_value,
MT min_value,
MT mean_value,
......@@ -76,26 +77,31 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info,
if (num_nan > 0 || num_inf > 0) {
printf(
"[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, "
"num_inf=%lld, max=%e, min=%e, mean=%e\n",
"num_inf=%lld, num_zero=%lld, max=%e, min=%e, mean=%e\n",
debug_info,
static_cast<long long>(numel), // NOLINT
static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
static_cast<float>(max_value),
static_cast<float>(min_value),
static_cast<float>(mean_value));
if (check_nan_inf_level == 0) {
#if defined(__NVCC__) || defined(__HIPCC__)
PADDLE_ENFORCE(false,
"There are NAN or INF (num_nan=%ld, num_inf=%lld) in %s.",
"There are NAN or INF (num_nan=%ld, num_inf=%lld, "
"num_zero=%lld) in %s.",
static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
debug_info);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are NAN or INF (num_nan=%lld, num_inf=%lld) in %s.",
"There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in "
"%s.",
static_cast<long long>(num_nan), // NOLINT
static_cast<long long>(num_inf), // NOLINT
static_cast<long long>(num_zero), // NOLINT
debug_info));
#endif
}
......@@ -114,6 +120,7 @@ void PrintForDifferentLevelFile(const char* debug_info,
int64_t numel,
int64_t num_nan,
int64_t num_inf,
int64_t num_zero,
MT max_value,
MT min_value,
MT mean_value,
......@@ -139,6 +146,7 @@ void PrintForDifferentLevelFile(const char* debug_info,
<< ", numel=" << static_cast<long long>(numel) // NOLINT
<< ", num_nan=" << static_cast<long long>(num_nan) // NOLINT
<< ", num_inf=" << static_cast<long long>(num_inf) // NOLINT
<< ", num_zero=" << static_cast<long long>(num_zero) // NOLINT
<< ", max=" << static_cast<float>(max_value)
<< ", min=" << static_cast<float>(min_value)
<< ", mean=" << static_cast<float>(mean_value) << std::endl;
......@@ -200,6 +208,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
std::vector<int64_t> thread_num_nan(num_threads, 0);
std::vector<int64_t> thread_num_inf(num_threads, 0);
std::vector<int64_t> thread_num_zero(num_threads, 0);
std::vector<MT> thread_min_value(num_threads, static_cast<MT>(value_ptr[0]));
std::vector<MT> thread_max_value(num_threads, static_cast<MT>(value_ptr[0]));
std::vector<MT> thread_mean_value(num_threads, static_cast<MT>(0));
......@@ -230,17 +239,22 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
} else if (std::isinf(value)) {
thread_num_inf[tid] += 1;
}
if (value == 0) {
thread_num_zero[tid] += 1;
}
}
}
int64_t num_nan = 0;
int64_t num_inf = 0;
int64_t num_zero = 0;
MT min_value = thread_min_value[0];
MT max_value = thread_max_value[0];
MT mean_value = static_cast<MT>(0);
for (int i = 0; i < num_threads; ++i) {
num_nan += thread_num_nan[i];
num_inf += thread_num_inf[i];
num_zero += thread_num_zero[i];
min_value = std::min(thread_min_value[i], min_value);
max_value = std::max(thread_max_value[i], max_value);
mean_value += thread_mean_value[i];
......@@ -254,6 +268,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
numel,
num_nan,
num_inf,
num_zero,
max_value,
min_value,
mean_value,
......@@ -266,6 +281,7 @@ static void CheckNanInfCpuImpl(const T* value_ptr,
numel,
num_nan,
num_inf,
num_zero,
max_value,
min_value,
mean_value,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册