diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 3f34a2d52d68ca1f6f8f8656bb423f2ac40cbbff..283021eae53bc49dc121f6ee2a1125a4cff6266e 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -42,10 +42,9 @@ __device__ __forceinline__ T sum_single_warp(T val) { return val; } -// This kernel is called when the class number is less than or equal to 512. template -__global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, - const int class_num) { +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int class_num) { int tid = threadIdx.x; extern __shared__ T d_sum[]; d_sum[tid] = 0; @@ -69,33 +68,6 @@ __global__ void SoftCrossEntropyKernel1(T* Y, const T* X, const T* label, if (tid == 0) Y[blockIdx.x] = -val; } -// This kernel is called when the class number is larger than 512. -template -__global__ void SoftCrossEntropyKernel2(T* Y, const T* X, const T* label, - const int class_num) { - int tid = threadIdx.x; - __shared__ T d_sum[BlockSize]; - int next_idx = blockIdx.x * class_num + tid; - - d_sum[tid] = 0; - int cur_idx = tid; - while (cur_idx < class_num) { - d_sum[tid] += TolerableValue()(std::log(X[next_idx])) * label[next_idx]; - next_idx += BlockSize; - cur_idx += BlockSize; - } - __syncthreads(); - - for (unsigned int stride = BlockSize >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); - } - - T val = d_sum[tid]; - val = sum_single_warp(val); - if (tid == 0) Y[blockIdx.x] = -val; -} - // TODO(qingqing): make zero setting a common function. template __global__ void zero(T* X, const int N) { @@ -146,26 +118,19 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { int batch_size = x->dims()[0]; int class_num = x->dims()[1]; - int block = 512; if (ctx.Attr("soft_label")) { auto* label_data = ctx.Input("Label")->data(); - if (class_num > 512) { - SoftCrossEntropyKernel2< - T, 512><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } else { - int block_size = pow(2, int(std::log2(class_num))); - SoftCrossEntropyKernel1< - T><<( - ctx.device_context()) - .stream()>>>(y_data, x_data, label_data, class_num); - } + int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num))); + + SoftCrossEntropyKernel< + T><<( + ctx.device_context()) + .stream()>>>(y_data, x_data, label_data, class_num); } else { auto* label_data = ctx.Input("Label")->data(); + int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<< grid, block, 0, reinterpret_cast( diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py index 1715a4db345d1b123fca2266d198ee1622234501..52032f3a698005393b6766a941697267daa2bd55 100644 --- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -4,19 +4,21 @@ from op_test import OpTest class TestCrossEntropyOp1(OpTest): - """Test standard cross-entropy, with index representation of labels. + """Test cross-entropy with discrete one-hot labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 30 class_num = 10 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32") cross_entropy = np.asmatrix( [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])], dtype="float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": False} @@ -29,14 +31,14 @@ class TestCrossEntropyOp1(OpTest): class TestCrossEntropyOp2(OpTest): - """Test soft-label cross-entropy, with vecterized soft labels. + """Test cross-entropy with vectorized soft labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests threads in more than one wrap. class_num = 37 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label = np.random.uniform(0.1, 1.0, @@ -44,6 +46,7 @@ class TestCrossEntropyOp2(OpTest): label /= label.sum(axis=1, keepdims=True) cross_entropy = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") + self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True} @@ -56,15 +59,14 @@ class TestCrossEntropyOp2(OpTest): class TestCrossEntropyOp3(OpTest): - """Test one-hot cross-entropy, with vecterized one-hot representation of - labels. + """Test cross-entropy with vectorized one-hot representation of labels. """ def setUp(self): self.op_type = "cross_entropy" batch_size = 5 - # this setting tests all threads in one wrap. class_num = 17 + X = np.random.uniform(0.1, 1.0, [batch_size, class_num]).astype("float32") label_index = np.random.randint( @@ -76,33 +78,7 @@ class TestCrossEntropyOp3(OpTest): dtype="float32") cross_entropy2 = (-label * np.log(X)).sum( axis=1, keepdims=True).astype("float32") - self.inputs = {"X": X, "Label": label} - self.outputs = {"Y": cross_entropy} - self.attrs = {"soft_label": True} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Y", max_relative_error=0.05) - - -class TestCrossEntropyOp4(OpTest): - """Test soft-label cross-entropy. - This unittest tests the gpu kernel for layer size excesses 512. - """ - def setUp(self): - self.op_type = "cross_entropy" - batch_size = 2 - class_num = 517 - X = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - label /= label.sum(axis=1, keepdims=True) - cross_entropy = (-label * np.log(X)).sum( - axis=1, keepdims=True).astype("float32") self.inputs = {"X": X, "Label": label} self.outputs = {"Y": cross_entropy} self.attrs = {"soft_label": True}