/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#include "paddle/framework/op_registry.h"
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/hostdevice.h"

namespace paddle {
namespace operators {

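// Hard-label forward kernel: each thread handles one sample i and computes
//   Y[i] = -log(X[i * D + label[i]]),
// where X is the N x D matrix of predicted probabilities and label[i] is the
// integer class index of sample i.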
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
                                   const int N, const int D) {
  // TODO(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
  // CUDA_1D_KERNEL_LOOP(i, N) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
  }
}

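// Soft-label forward kernel: one block per sample. The block's threads stride
// over the D classes, accumulating label[i][j] * log(X[i][j]) into shared
// memory, and a tree reduction yields Y[i] = -sum_j label[i][j] * log(X[i][j]).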
template <typename T, int blockSize>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                       const int N, const int D) {
  int tid = threadIdx.x;
  __shared__ T d_sum[blockSize];
  int next_idx = blockIdx.x * D + tid;

  d_sum[tid] = 0;
  int cur_idx = tid;
  while (cur_idx < D) {
    d_sum[tid] += tolerable_value(log(X[next_idx])) * label[next_idx];
    next_idx += blockSize;
    cur_idx += blockSize;
  }
  __syncthreads();

  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
    __syncthreads();
    if (tid < stride) {
      next_idx = tid + stride;
      d_sum[tid] += d_sum[next_idx];
    }
  }
  __syncthreads();

  if (tid == 0) {
    Y[blockIdx.x] = -d_sum[0];
  }
}

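// Sets the first N elements of X to zero; used below to clear dX before the
// gradient kernels run, since the hard-label kernel writes only one entry per
// row.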
// TODO(qingqing): make zero setting a common function.
template <typename T>
__global__ void zero(T* X, const int N) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    X[i] = 0.0;
  }
}

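// Hard-label backward kernel: each thread handles one sample i and writes the
// single non-zero entry of its row,
//   dX[i * D + label[i]] = -dY[i] / X[i * D + label[i]];
// the rest of dX is expected to have been zeroed beforehand.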
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                           const int* label, const int N,
                                           const int D) {
  // TODO(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
  // CUDA_1D_KERNEL_LOOP(i, N) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    int idx = i * D + label[i];
    dX[idx] = -dY[i] / X[idx];
  }
}

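// Soft-label backward kernel: each thread handles one sample i and loops over
// all D classes, computing dX[i][j] = -label[i][j] * dY[i] / X[i][j].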
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                               const T* label, const int N,
                                               const int D) {
  // TODO(qingqing): optimize this kernel.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    for (int j = 0; j < D; ++j) {
      int idx = i * D + j;
      dX[idx] = -label[idx] * dY[i] / X[idx];
    }
  }
}

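// Forward GPU kernel: reads the N x D probability matrix X and the Label
// tensor, writes the per-sample loss into Y, and dispatches to the soft- or
// hard-label kernel depending on the soft_label attribute.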
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use GPUPlace.");

    auto x = ctx.Input<Tensor>("X");
    auto y = ctx.Output<Tensor>("Y");
    auto label = ctx.Input<Tensor>("Label");

    auto* x_data = x->data<T>();
    y->mutable_data<T>(ctx.GetPlace());
    auto* y_data = y->data<T>();

    int n = x->dims()[0];
    int d = x->dims()[1];
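    // 512 threads per block; for the hard-label kernel one thread handles one
    // sample, so the grid covers all n samples.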
    int block = 512;
    int grid = (n + block - 1) / block;
    // TODO(qingqing): launch kernel on specified stream
    // based on ExecutionContext.
    if (ctx.Attr<int>("soft_label") == 1) {
      auto* label_data = label->data<T>();
      // One block per sample; each block reduces across the D classes.
      grid = n;
      SoftCrossEntropyKernel<T, 512><<<grid, block>>>(y_data, x_data,
                                                      label_data, n, d);
    } else {
      auto* label_data = label->data<int>();
      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
    }
  }
};

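// Backward GPU kernel: computes dX from X, dY and Label, zeroing dX first and
// then dispatching to the soft- or hard-label gradient kernel.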
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use GPUPlace.");

    auto x = ctx.Input<Tensor>("X");
    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto label = ctx.Input<Tensor>("Label");

    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
    auto* dy_data = dy->data<T>();
    auto* x_data = x->data<T>();

    int n = x->dims()[0];
    int d = x->dims()[1];
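    // Clear all n * d entries of dX first: the hard-label gradient kernel
    // below writes only one entry per row.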
    int block = 512;
    int grid = (n * d + block - 1) / block;
    zero<T><<<grid, block>>>(dx_data, n * d);
    grid = (n + block - 1) / block;
    // TODO(qingqing): launch kernel on specified stream
    // based on ExecutionContext.
    if (ctx.Attr<int>("soft_label") == 1) {
      auto* label_data = label->data<T>();
      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
          dx_data, dy_data, x_data, label_data, n, d);
    } else {
      auto* label_data = label->data<int>();
      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
                                                     label_data, n, d);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
                       ops::CrossEntropyGradientOpCUDAKernel<float>);