Commit 141b8dbc authored by caoying03

update the backward kernel.

Parent a3a8a090
paddle/operators/cross_entropy_op.cu

@@ -28,27 +28,27 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+    Y[i] = -TolerableValue<T>()(log(X[i * D + label[i]]));
   }
 }

-template <typename T, int blockSize>
+template <typename T, int BlockSize>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int N, const int D) {
   int tid = threadIdx.x;
-  __shared__ T d_sum[blockSize];
+  __shared__ T d_sum[BlockSize];
   int next_idx = blockIdx.x * D + tid;

   d_sum[tid] = 0;
   int cur_idx = tid;
   while (cur_idx < D) {
-    d_sum[tid] += tolerable_value(std::log(X[next_idx])) * label[next_idx];
-    next_idx += blockSize;
-    cur_idx += blockSize;
+    d_sum[tid] += TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
+    next_idx += BlockSize;
+    cur_idx += BlockSize;
   }
   __syncthreads();

-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
+  for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) {
     __syncthreads();
     if (tid < stride) {
       next_idx = tid + stride;
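(The hunk above ends mid-kernel; the remainder is the unchanged half of a standard shared-memory tree reduction.) For readers unfamiliar with the pattern: each thread first accumulates a strided partial sum over the D classes, then the block repeatedly halves the number of active threads until d_sum[0] holds the row total. A minimal standalone sketch of the same two-phase pattern — illustrative only (the kernel name RowSumKernel is hypothetical), not this file's exact code:

    // Phase 1: strided accumulation. Phase 2: tree reduction in shared memory.
    // BlockSize must be a power of two; one block reduces one row of D values.
    template <typename T, int BlockSize>
    __global__ void RowSumKernel(T* out, const T* in, const int D) {
      __shared__ T partial[BlockSize];
      int tid = threadIdx.x;

      T sum = 0;  // thread tid sums elements tid, tid + BlockSize, ...
      for (int i = tid; i < D; i += BlockSize) sum += in[blockIdx.x * D + i];
      partial[tid] = sum;
      __syncthreads();

      for (int stride = BlockSize >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) partial[tid] += partial[tid + stride];
        __syncthreads();  // every thread reaches the barrier each iteration
      }
      if (tid == 0) out[blockIdx.x] = partial[0];
    }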
@@ -88,13 +88,12 @@ template <typename T>
 __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                                const T* label, const int N,
                                                const int D) {
-  // TOOD(qingqing): optimize for this kernel
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    for (int j = 0; j < D; ++j) {
-      int idx = i * D + j;
-      dX[idx] = -label[idx] * dY[i] / X[idx];
-    }
+  int row_ids = blockIdx.x * blockDim.x + threadIdx.x;
+  int col_ids = blockIdx.y * blockDim.y + threadIdx.y;
+  int ids = row_ids * D + col_ids;
+
+  if (ids < N * D) {
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
   }
 }
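Why this expression is the right gradient: for soft labels the forward pass computes Y_i = -Σ_j label_ij · log(X_ij), so ∂Y_i/∂X_ij = -label_ij / X_ij, and the chain rule gives dX_ij = dY_i · (-label_ij / X_ij) — exactly the elementwise statement above, now with one thread per (row, column) element instead of the old one-row-per-thread loop. One caveat worth flagging in review: the single guard ids < N * D does not keep col_ids inside a row. If gridDim.y * blockDim.y exceeds D, a thread with col_ids >= D can still pass the test, alias into a later row, and race with the thread that owns that element while reading the wrong dY entry. A stricter guard — an editorial suggestion, not part of this commit — would be:

    // Hypothetical stricter bound check (suggestion, not in this commit):
    if (row_ids < N && col_ids < D) {
      dX[ids] = -label[ids] * dY[row_ids] / X[ids];
    }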
@@ -103,7 +102,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
     auto x = ctx.Input<Tensor>("X");
     auto y = ctx.Output<Tensor>("Y");
@@ -136,7 +135,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
     auto x = ctx.Input<Tensor>("X");
     auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
@@ -156,6 +155,11 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
     // TODO(qingqing): launch kernel on specified stream
     // base on ExecutionContext.
     if (ctx.Attr<int>("soft_label") == 1) {
+      int block_x = 32;
+      int block_y = 32;
+      dim3 block(block_x, block_y);
+      dim3 grid((n + block_x - 1) / block_x, (d + block_y - 1) / block_y);
+
       auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
           dx_data, dy_data, x_data, label_data, n, d);
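With block(32, 32), each block covers a 32×32 tile of the N×D gradient matrix, and the ceil-division grid lays down just enough tiles to cover every element. A quick arithmetic check: n = 100, d = 10 gives grid((100 + 31) / 32, (10 + 31) / 32) = grid(4, 1) — four 1024-thread blocks, i.e. 4096 threads for 1000 elements, with the in-kernel bound check masking the surplus threads.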
paddle/operators/cross_entropy_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/hostdevice.h"
@@ -20,11 +21,16 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

 template <typename T>
-HOSTDEVICE T tolerable_value(const T x) {
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
     PADDLE_ASSERT(std::is_floating_point<T>::value);
     const T kApproInf = 1e20;
+
     if (x == INFINITY) {
       return kApproInf;
     }

@@ -32,7 +38,8 @@ HOSTDEVICE T tolerable_value(const T x) {
     return -kApproInf;
   }
   return x;
 }
+};

 template <typename T>
 class CrossEntropyOpKernel : public framework::OpKernel {
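Editorial aside: turning the free function tolerable_value into the TolerableValue functor is what enables the Eigen-based path in the next hunk — a functor instance can be handed to Eigen's unaryExpr (as the commented-out line below shows), and HOSTDEVICE keeps it callable from both CPU code and CUDA kernels. A small host-side sketch, illustrative only and assuming Eigen is available in the translation unit:

    // Clamping log outputs with the functor via Eigen's unaryExpr.
    Eigen::ArrayXf prob(3);
    prob << 0.2f, 0.0f, 0.8f;
    // log(0) = -inf would poison any later sum; TolerableValue clamps the
    // infinities to +/-1e20 instead.
    Eigen::ArrayXf safe_log = prob.log().unaryExpr(TolerableValue<float>());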
@@ -40,33 +47,34 @@ class CrossEntropyOpKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-    auto* x_data = x->data<T>();
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();

-    int batch_size = x->dims()[0];
-    int class_num = x->dims()[1];
+    const int batch_size = x->dims()[0];

     if (ctx.Attr<int>("soft_label") == 1) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        T sum = static_cast<T>(0);
-        for (int j = 0; j < class_num; ++j) {
-          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
-          y_data[i] = -sum;
-          index++;
-        }
-      }
+      auto prob = EigenMatrix<T>::From(*x);
+      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      auto loss = EigenMatrix<T>::From(*y);
+
+      // loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      //     prob.log().unaryExpr(TolerableValue<T>());
+
+      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -((lbl_mat * prob.log())
+                .sum(Eigen::DSizes<int, 1>(1))
+                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
     } else {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      const int class_num = x->dims()[1];
+      const T* x_data = x->data<T>();
+      T* y_data = y->data<T>();
+
+      const int* label_data = labels->data<int>();
       for (int i = 0; i < batch_size; ++i) {
         int index = i * class_num + label_data[i];
-        y_data[i] = -tolerable_value(std::log(x_data[index]));
+        y_data[i] = -TolerableValue<T>()(std::log(x_data[index]));
       }
     }
   }
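A quick numeric check of the new soft-label expression, which computes loss_i = -Σ_j label_ij · log(prob_ij) row by row: for prob = (0.2, 0.3, 0.5) and label = (0, 0, 1), loss = -log(0.5) ≈ 0.693, matching what the removed scalar loop produced. Note that as committed, the Eigen path feeds prob.log() into the sum directly — the TolerableValue clamp appears only in the commented-out line — so a zero probability would propagate -inf through the soft-label loss, while the hard-label branch below still clamps.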