Commit 141b8dbc authored by caoying03

update the backward kernel.

Parent a3a8a090
@@ -28,27 +28,27 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+    Y[i] = -TolerableValue<T>()(log(X[i * D + label[i]]));
   }
 }
 
-template <typename T, int blockSize>
+template <typename T, int BlockSize>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int N, const int D) {
   int tid = threadIdx.x;
-  __shared__ T d_sum[blockSize];
+  __shared__ T d_sum[BlockSize];
   int next_idx = blockIdx.x * D + tid;
 
   d_sum[tid] = 0;
   int cur_idx = tid;
   while (cur_idx < D) {
-    d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx];
-    next_idx += blockSize;
-    cur_idx += blockSize;
+    d_sum[tid] += TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
+    next_idx += BlockSize;
+    cur_idx += BlockSize;
   }
   __syncthreads();
 
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
+  for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) {
     __syncthreads();
     if (tid < stride) {
       next_idx = tid + stride;
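Note: the loop above is the tail of a standard shared-memory tree reduction. A minimal standalone sketch of the same pattern, with kernel name, block size, and host code assumed by me rather than taken from this commit:

```cuda
#include <cstdio>

// Each thread first accumulates a strided partial sum into shared memory,
// then the active half of the block folds the upper half onto the lower
// half until d_sum[0] holds the block-wide total.
template <int BlockSize>
__global__ void BlockSumKernel(float* out, const float* in, int n) {
  __shared__ float d_sum[BlockSize];
  int tid = threadIdx.x;
  d_sum[tid] = 0;
  for (int i = tid; i < n; i += BlockSize) d_sum[tid] += in[i];
  __syncthreads();
  for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) {
    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
    __syncthreads();
  }
  if (tid == 0) *out = d_sum[0];
}

int main() {
  const int n = 1000;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;
  BlockSumKernel<128><<<1, 128>>>(out, in, n);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *out);  // expect 1000.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```

SoftCrossEntropyKernel follows the same shape: the per-thread accumulation is its while loop over columns, and each block reduces one row of the batch (next_idx starts at blockIdx.x * D + tid).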
@@ -88,13 +88,12 @@ template <typename T>
 __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                                const T* label, const int N,
                                                const int D) {
-  // TOOD(qingqing): optimize for this kernel
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    for (int j = 0; j < D; ++j) {
-      int idx = i * D + j;
-      dX[idx] = -label[idx] * dY[i] / X[idx];
-    }
-  }
+  int row_ids = blockIdx.x * blockDim.x + threadIdx.x;
+  int col_ids = blockIdx.y * blockDim.y + threadIdx.y;
+  int ids = row_ids * D + col_ids;
+  if (ids < N * D) {
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
+  }
 }
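Note: the rewritten kernel assigns one thread per matrix element instead of letting each thread loop over all D columns. The stored value follows from the soft-label loss y_i = -sum_j label[i][j] * log(X[i][j]): differentiating gives dy_i/dX[i][j] = -label[i][j] / X[i][j], and chaining with the upstream gradient dY[i] yields dX[i][j] = -label[i][j] * dY[i] / X[i][j], the expression on both sides of this hunk. A quick finite-difference sanity check of that formula (all values assumed):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const int D = 3;
  double x[D] = {0.7, 0.2, 0.1};      // one row of predicted probabilities
  double label[D] = {0.5, 0.3, 0.2};  // matching soft labels
  double dY = 1.0;                    // upstream gradient, taken as 1
  double eps = 1e-6;
  for (int j = 0; j < D; ++j) {
    double analytic = -label[j] * dY / x[j];
    // Central difference of the j-th loss term -label[j] * log(x[j]).
    double numeric =
        (-label[j] * std::log(x[j] + eps) + label[j] * std::log(x[j] - eps)) /
        (2 * eps);
    printf("j=%d analytic=%.6f numeric=%.6f\n", j, analytic, numeric);
  }
  return 0;
}
```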
@@ -103,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
 
     auto x = ctx.Input<Tensor>("X");
     auto y = ctx.Output<Tensor>("Y");
@@ -136,7 +135,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
 
     auto x = ctx.Input<Tensor>("X");
     auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
@@ -156,6 +155,11 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
     // TODO(qingqing): launch kernel on specified stream
     // base on ExecutionContext.
     if (ctx.Attr<int>("soft_label") == 1) {
+      int block_x = 32;
+      int block_y = 32;
+      dim3 block(block_x, block_y);
+      dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y);
+      auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
           dx_data, dy_data, x_data, label_data, n, d);
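Note: this launch configuration pairs with the 2-D kernel above: the x dimension walks the n rows and the y dimension the d columns, with ceil-division so partial blocks still cover the tail. A small host-side illustration with assumed sizes:

```cpp
#include <cstdio>

int main() {
  int n = 64, d = 10;              // example batch size and class count
  int block_x = 32, block_y = 32;  // 32 x 32 = 1024 threads per block
  int grid_x = (n + block_x - 1) / block_x;  // ceil(64 / 32) = 2
  int grid_y = (d + block_y - 1) / block_y;  // ceil(10 / 32) = 1
  printf("grid = (%d, %d), threads launched = %d, elements = %d\n",
         grid_x, grid_y, grid_x * grid_y * block_x * block_y, n * d);
  return 0;
}
```

Threads whose flattened index lands at or beyond n * d are masked by the ids < N * D guard in the kernel; note, though, that a thread with col_ids >= d can still pass that guard with an index belonging to a later row, so the row/column mapping is only exact when d is a multiple of block_y.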
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/hostdevice.h"
@@ -20,19 +21,25 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-HOSTDEVICE T tolerable_value(const T x) {
-  PADDLE_ASSERT(std::is_floating_point<T>::value);
-  const T kApproInf = 1e20;
-  if (x == INFINITY) {
-    return kApproInf;
-  }
-  if (x == -INFINITY) {
-    return -kApproInf;
-  }
-  return x;
-}
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+    if (x == INFINITY) {
+      return kApproInf;
+    }
+    if (x == -INFINITY) {
+      return -kApproInf;
+    }
+    return x;
+  }
+};
 
 template <typename T>
 class CrossEntropyOpKernel : public framework::OpKernel {
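Note: replacing the free function tolerable_value with the TolerableValue functor keeps the clamping semantics identical, but a functor is a callable object that can also be handed to Eigen's unaryExpr, which the commented-out line in the next hunk hints at. A standalone sketch of the behavior, with HOSTDEVICE and PADDLE_ASSERT dropped so it compiles outside the framework:

```cpp
#include <cmath>
#include <cstdio>

template <typename T>
struct TolerableValue {
  T operator()(const T& x) const {
    const T kApproInf = 1e20;
    if (x == INFINITY) return kApproInf;
    if (x == -INFINITY) return -kApproInf;
    return x;
  }
};

int main() {
  // log(0) is -inf; clamping it to -1e20 keeps the loss finite instead of
  // letting inf (and then nan) propagate through the reduction.
  printf("%g\n", TolerableValue<double>()(std::log(0.0)));  // -1e+20
  printf("%g\n", TolerableValue<double>()(std::log(0.5)));  // -0.693147
  return 0;
}
```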
@@ -40,33 +47,34 @@ class CrossEntropyOpKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-    auto* x_data = x->data<T>();
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
 
-    int batch_size = x->dims()[0];
-    int class_num = x->dims()[1];
+    const int batch_size = x->dims()[0];
     if (ctx.Attr<int>("soft_label") == 1) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        T sum = static_cast<T>(0);
-        for (int j = 0; j < class_num; ++j) {
-          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
-          y_data[i] = -sum;
-          index++;
-        }
-      }
+      auto prob = EigenMatrix<T>::From(*x);
+      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      auto loss = EigenMatrix<T>::From(*y);
+
+      // loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      //     prob.log().unaryExpr(TolerableValue<T>());
+      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -((lbl_mat * prob.log())
+                .sum(Eigen::DSizes<int, 1>(1))
+                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
     } else {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      const int class_num = x->dims()[1];
+      const T* x_data = x->data<T>();
+      T* y_data = y->data<T>();
+      const int* label_data = labels->data<int>();
       for (int i = 0; i < batch_size; ++i) {
         int index = i * class_num + label_data[i];
-        y_data[i] = -tolerable_value(std::log(x_data[index]));
+        y_data[i] = -TolerableValue<T>()(std::log(x_data[index]));
       }
     }
   }
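Note: a framework-free sketch of what the new Eigen expression computes, per-row loss_i = -sum_j label[i][j] * log(prob[i][j]), with example values assumed by me:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const int batch_size = 2, class_num = 3;
  // Rows of prob sum to 1, as softmax output would; labels are soft.
  double prob[2][3] = {{0.7, 0.2, 0.1}, {0.1, 0.8, 0.1}};
  double label[2][3] = {{1.0, 0.0, 0.0}, {0.0, 0.9, 0.1}};
  for (int i = 0; i < batch_size; ++i) {
    double loss = 0;
    for (int j = 0; j < class_num; ++j) {
      loss -= label[i][j] * std::log(prob[i][j]);
    }
    // Mirrors -((lbl_mat * prob.log()).sum(1)) reshaped to a column.
    printf("loss[%d] = %f\n", i, loss);
  }
  return 0;
}
```

One detail worth noting: the removed loop wrapped each log through tolerable_value, while the new expression calls prob.log() directly, leaving the unaryExpr(TolerableValue<T>()) variant commented out.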
......