Unverified commit b5a16dca, authored by qingqing01, committed by GitHub

Fix a critical bug in softmax_with_cross_entropy_op backward. (#9120)

* Fix a critical bug in softmax_with_cross_entropy_op that led to wrong gradients in the backward pass (a sketch of the corrected two-kernel pattern follows the commit metadata below).

* Enhance unit testing.
Parent 1e4c504e
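Background on the fix, as the diff below suggests: the old backward pass computed the hard-label gradient, d logit[i][j] = (softmax[i][j] - 1{j == labels[i]}) * d loss[i], in a single kernel launch, with the "subtract 1 at the label column" step and the elementwise scaling separated only by __syncthreads(). __syncthreads() synchronizes threads within one block, not across blocks, so once batch_size * class_num spans more than one block the scaling can run before another block has subtracted 1 at a label position, producing wrong gradients (which is presumably also why the tests bump batch_size from 2 to 41: 41 * 37 elements no longer fit into a single 512-thread block). The fix splits the work into two kernels launched back-to-back on the same stream and uses grid-stride loops. The following is a minimal, self-contained sketch of that two-kernel pattern, not the PaddlePaddle code; the kernel names and toy data are invented for the illustration.

// Minimal sketch of the two-kernel backward pass described above. This is an
// illustration, not the PaddlePaddle operator: names such as
// SubtractOneAtLabel/ScaleByLossGrad and the toy data are invented for the
// example. logit_grad starts out holding the softmax saved by the forward pass.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void SubtractOneAtLabel(float* logit_grad, const int64_t* labels,
                                   int batch_size, int class_num) {
  // One iteration per sample: grad[label] = softmax[label] - 1.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
       i += blockDim.x * gridDim.x) {
    logit_grad[i * class_num + labels[i]] -= 1.f;
  }
}

__global__ void ScaleByLossGrad(float* logit_grad, const float* loss_grad,
                                int num, int class_num) {
  // One iteration per element: multiply by the loss gradient of its sample.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    logit_grad[i] *= loss_grad[i / class_num];
  }
}

int main() {
  const int batch_size = 2, class_num = 3, num = batch_size * class_num;
  float h_grad[num] = {0.2f, 0.3f, 0.5f, 0.1f, 0.6f, 0.3f};  // softmax output
  int64_t h_labels[batch_size] = {2, 1};
  float h_loss_grad[batch_size] = {1.f, 1.f};

  float *d_grad, *d_loss;
  int64_t* d_labels;
  cudaMalloc(&d_grad, num * sizeof(float));
  cudaMalloc(&d_loss, batch_size * sizeof(float));
  cudaMalloc(&d_labels, batch_size * sizeof(int64_t));
  cudaMemcpy(d_grad, h_grad, num * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_loss, h_loss_grad, batch_size * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_labels, h_labels, batch_size * sizeof(int64_t),
             cudaMemcpyHostToDevice);

  int block = 512;
  // Two separate launches: the second cannot start before the first finishes
  // on the same stream, so no cross-block synchronization is needed.
  SubtractOneAtLabel<<<(batch_size + block - 1) / block, block>>>(
      d_grad, d_labels, batch_size, class_num);
  ScaleByLossGrad<<<(num + block - 1) / block, block>>>(d_grad, d_loss, num,
                                                        class_num);

  cudaMemcpy(h_grad, d_grad, num * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num; ++i) printf("%.2f ", h_grad[i]);
  printf("\n");
  cudaFree(d_grad);
  cudaFree(d_loss);
  cudaFree(d_labels);
  return 0;
}

Compiled with nvcc and run on a GPU, this should print 0.20 0.30 -0.50 0.10 -0.40 0.30, i.e. softmax minus the one-hot label for each row, scaled by a loss gradient of 1.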
...
@@ -23,21 +23,21 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
-__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int64_t* labels, const int batch_size,
-                                 const int class_num) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int sample_idx = tid / class_num;
-
-  if (tid < batch_size) {
-    PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
-    logit_grad[tid * class_num + labels[tid]] -= static_cast<T>(1.);
-  }
-
-  __syncthreads();
-
-  if (tid < batch_size * class_num) {
-    logit_grad[tid] *= loss_grad[sample_idx];
+__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
+                                 const int batch_size, const int class_num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * class_num + labels[i];
+    logit_grad[idx] -= static_cast<T>(1.);
+  }
+}
+
+template <typename T>
+__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
+                      const int class_num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    logit_grad[i] *= loss_grad[i / class_num];
   }
 }
...
@@ -94,22 +94,22 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     const int batch_size = logit_grad->dims()[0];
     const int class_num = logit_grad->dims()[1];
     int block = 512;
-    int grid = (batch_size * class_num + block - 1) / block;
+    auto stream = context.cuda_device_context().stream();
 
     if (context.Attr<bool>("soft_label")) {
+      int grid = (batch_size * class_num + block - 1) / block;
       const T* label_data = labels->data<T>();
-      SoftCrossEntropyGradientKernel<
-          T><<<grid, block, 0,
-               context.template device_context<platform::CUDADeviceContext>()
-                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
-                                batch_size, class_num);
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          logit_grad_data, loss_grad_data, label_data, batch_size, class_num);
     } else {
+      int grid = (batch_size + block - 1) / block;
       const int64_t* label_data = labels->data<int64_t>();
-      CrossEntropyGrad<
-          T><<<grid, block, 0,
-               context.template device_context<platform::CUDADeviceContext>()
-                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
-                                batch_size, class_num);
+      CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
+          logit_grad_data, label_data, batch_size, class_num);
+      int num = batch_size * class_num;
+      grid = (num + block - 1) / block;
+      Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
+                                           class_num);
     }
   }
 };
...
...
@@ -26,7 +26,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 2
+        batch_size = 41
         class_num = 37
         logits = np.random.uniform(0.1, 1.0,
...
@@ -59,7 +59,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 2
+        batch_size = 41
         class_num = 37
         logits = np.random.uniform(0.1, 1.0,
...