Commit a3a8a090, authored by caoying03

Optimize the cross entropy kernel by using a parallel reduction.

Parent 414a7a1e
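The change below replaces a serial per-thread loop over the class dimension with the standard shared-memory reduction pattern: each thread of a block first accumulates a strided partial sum, then a tree reduction halves the number of active threads per step until thread 0 holds the block's total. A minimal standalone sketch of that pattern, with hypothetical names (RowSumKernel is illustrative, not part of this patch):

#include <cstdio>
#include <cuda_runtime.h>

// One thread block per row. Each thread first accumulates a strided
// partial sum over the row, then a shared-memory tree reduction folds
// the kBlockSize partials down to a single value in partial[0].
template <typename T, int kBlockSize>
__global__ void RowSumKernel(T* out, const T* in, const int width) {
  __shared__ T partial[kBlockSize];
  const int tid = threadIdx.x;

  // Strided accumulation: thread t sums columns t, t + kBlockSize, ...
  T sum = 0;
  for (int col = tid; col < width; col += kBlockSize) {
    sum += in[blockIdx.x * width + col];
  }
  partial[tid] = sum;
  __syncthreads();

  // Tree reduction; kBlockSize must be a power of two so the halving
  // covers every shared-memory slot.
  for (int stride = kBlockSize / 2; stride > 0; stride /= 2) {
    if (tid < stride) {
      partial[tid] += partial[tid + stride];
    }
    __syncthreads();
  }

  if (tid == 0) {
    out[blockIdx.x] = partial[0];
  }
}

int main() {
  const int rows = 4, width = 1000;
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, rows * width * sizeof(float));
  cudaMallocManaged(&out, rows * sizeof(float));
  for (int i = 0; i < rows * width; ++i) in[i] = 1.0f;

  // The launch block size must equal the template parameter, since the
  // kernel sizes its shared memory and its strides from kBlockSize.
  RowSumKernel<float, 512><<<rows, 512>>>(out, in, width);
  cudaDeviceSynchronize();

  for (int i = 0; i < rows; ++i) {
    printf("row %d: %.1f (expected %d)\n", i, out[i], width);
  }
  cudaFree(in);
  cudaFree(out);
  return 0;
}

The patched SoftCrossEntropyKernel has the same shape; its partial terms are tolerable_value(std::log(X[idx])) * label[idx], and the block's total is negated before being written out.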
@@ -32,16 +32,33 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
   }
 }
 
-template <typename T>
+template <typename T, int blockSize>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int N, const int D) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    T sum = static_cast<T>(0);
-    for (int j = 0; j < D; j++) {
-      sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
-    }
-    Y[i] = -sum;
+  int tid = threadIdx.x;
+  __shared__ T d_sum[blockSize];
+  int next_idx = blockIdx.x * D + tid;
+
+  d_sum[tid] = 0;
+  int cur_idx = tid;
+  while (cur_idx < D) {
+    d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx];
+    next_idx += blockSize;
+    cur_idx += blockSize;
+  }
+  __syncthreads();
+
+  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
+    __syncthreads();
+    if (tid < stride) {
+      next_idx = tid + stride;
+      d_sum[tid] += d_sum[next_idx];
+    }
+  }
+  __syncthreads();
+
+  if (tid == 0) {
+    Y[blockIdx.x] = -d_sum[0];
   }
 }
 
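Both kernels route the log through tolerable_value, which keeps the -inf produced by log(0) from poisoning the sum. That helper lives outside this diff; a hedged sketch of such a guard, assuming a clamp to a large finite constant (the real helper's constant and checks may differ):

#include <cmath>

// Hypothetical stand-in for tolerable_value(): clamp +/- infinity to a
// large finite constant so a zero probability cannot yield a
// non-finite loss term.
template <typename T>
__host__ __device__ T tolerable_value(const T x) {
  const T kApproInf = 1e20;
  if (x == INFINITY) return kApproInf;
  if (x == -INFINITY) return -kApproInf;
  return x;
}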
@@ -104,8 +121,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
     // base on ExecutionContext.
     if (ctx.Attr<int>("soft_label") == 1) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
-                                                 d);
+      grid = d;
+      SoftCrossEntropyKernel<T, 512><<<grid, block>>>(y_data, x_data,
+                                                      label_data, n, d);
     } else {
       auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
       CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
...
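For reference, per sample i the soft-label path (and the numpy expression in the tests below) computes

Y_i = -\sum_{j=1}^{D} \mathrm{label}_{ij} \log X_{ij}, \qquad i = 1, \ldots, N,

so one block now produces one element of Y, and the kernel must be instantiated with a template block size that matches the launch configuration (512 here): the shared-memory array and the reduction strides are both derived from that parameter.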
@@ -19,7 +19,7 @@ class TestCrossEntropyOp1(OpTest):
             dtype="float32")
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {'soft_label': 0}
+        self.attrs = {"soft_label": 0}
 
     def test_check_output(self):
         self.check_output()
@@ -34,8 +34,8 @@ class TestCrossEntropyOp2(OpTest):
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 10
-        class_num = 5
+        batch_size = 13
+        class_num = 37
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label = np.random.uniform(0.1, 1.0,
@@ -43,15 +43,16 @@ class TestCrossEntropyOp2(OpTest):
         label /= label.sum(axis=1, keepdims=True)
         cross_entropy = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": 1}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 class TestCrossEntropyOp3(OpTest):
@@ -61,8 +62,8 @@ class TestCrossEntropyOp3(OpTest):
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 30
-        class_num = 10
+        batch_size = 13
+        class_num = 37
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label_index = np.random.randint(
@@ -74,15 +75,15 @@ class TestCrossEntropyOp3(OpTest):
             dtype="float32")
         cross_entropy2 = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': 1}
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": 1}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 if __name__ == "__main__":
...
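The gradient that the loosened checks (max_relative_error=0.05) exercise follows directly from the formula above:

\frac{\partial Y_i}{\partial X_{ij}} = -\frac{\mathrm{label}_{ij}}{X_{ij}},

which also explains why the tests draw X from [0.1, 1.0]: bounding X away from zero keeps both log X and this gradient well conditioned for the finite-difference comparison.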