Commit 6d60352e authored by Xinghai Sun

Add soft-label support for cross-entropy operator.

Parent 0f42e564
@@ -17,48 +17,62 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
+class CrossEntropyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");
-    auto *label = ctx.Input<Tensor>("label");
-    PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2.");
-    PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1.");
-    PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]);
-    ctx.Output<Tensor>("Y")->Resize({X->dims()[0]});
+    auto *x = ctx.Input<Tensor>("X");
+    auto *label = ctx.Input<Tensor>("Label");
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "X's rank must be 2.");
+    PADDLE_ASSERT(label->dims().size() == 1 || label->dims().size() == 2);
+    if (label->dims().size() == 2) {
+      // soft cross entropy
+      PADDLE_ENFORCE_EQ(x->dims(), label->dims());
+    } else {
+      // normal cross entropy
+      PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0]);
+    }
+    ctx.Output<Tensor>("Y")->Resize({x->dims()[0]});
   }
 };
 
-class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
+class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto X = ctx.Input<Tensor>("X");
-    dX->Resize(X->dims());
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto x = ctx.Input<Tensor>("X");
+    dx->Resize(x->dims());
   }
 };
 
-class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  CrossEntropyOpMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of OnehotCrossEntropyOp");
-    AddInput("label", "The second input of OnehotCrossEntropyOp");
-    AddOutput("Y", "The output of OnehotCrossEntropyOp");
+    AddInput("X", "The first input of CrossEntropyOp");
+    AddInput("Label", "The second input of CrossEntropyOp");
+    AddOutput("Y", "The output of CrossEntropyOp");
     AddComment(R"DOC(
-OnehotCrossEntropy Operator.
-
-Y[i] = -log(X[i][j])
+CrossEntropy Operator.
+
+The second input (Label tensor) supports two kinds of shapes:
+1) Rank(Label) = 1, Label[i] indicates the class index for sample i:
+   Y[i] = -log(X[i, Label[i]])
+2) Rank(Label) = 2, Label[i, j] indicates the soft label of class j
+   for sample i:
+   Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+Please make sure that in this case the summation of each row of Label
+equals one. If each row of Label has only one non-zero element (equals 1),
+it degenerates to a standard one-hot representation.
 )DOC");
   }
 };
 
@@ -66,10 +80,8 @@ OnehotCrossEntropy Operator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
-            ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
-            ops::OnehotCrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpKernel<float>);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
-                       ops::OnehotCrossEntropyGradientOpKernel<float>);
+REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpKernel<float>);
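
For reference, the two Label modes described in the DOC comment above reduce to a few lines of numpy. The sketch below is illustrative only; the function name and shapes are assumed and it is not part of the operator's API:

import numpy

def cross_entropy_forward(x, label):
    # x:     (batch_size, class_num) probabilities
    # label: (batch_size,) int class indices, or
    #        (batch_size, class_num) soft labels whose rows sum to one
    if label.ndim == 2:
        # soft-label mode: Y[i] = -sum_j Label[i, j] * log(X[i, j])
        return (-label * numpy.log(x)).sum(axis=1)
    # index-label mode: Y[i] = -log(X[i, Label[i]])
    return -numpy.log(x[numpy.arange(x.shape[0]), label])

# A one-hot row in soft-label form gives the same result as the class index.
x = numpy.array([[0.2, 0.7, 0.1]])
assert numpy.allclose(cross_entropy_forward(x, numpy.array([1])),
                      cross_entropy_forward(x, numpy.array([[0., 1., 0.]])))
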
@@ -21,17 +21,16 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename T>
-__host__ __device__ T clipping_log(const T x) {
+__host__ __device__ T tolerable_value(const T x) {
   PADDLE_ASSERT(std::is_floating_point<T>::value);
   const T kApproInf = 1e20;
-  T v = log(x);
-  if (v == INFINITY) {
+  if (x == INFINITY) {
     return kApproInf;
   }
-  if (v == -INFINITY) {
+  if (x == -INFINITY) {
     return -kApproInf;
   }
-  return v;
+  return x;
 }
 
 template <typename T>
@@ -42,7 +41,20 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -clipping_log(X[i * D + label[i]]);
+    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
+                                       const int N, const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    T sum = static_cast<T>(0);
+    for (int j = 0; j < D; j++) {
+      sum += label[i * D + j] * log(X[i * D + j]);
+    }
+    Y[i] = -tolerable_value(sum);
   }
 }
 
@@ -69,57 +81,89 @@ __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
 }
 
 template <typename T>
-class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
+__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                               const T* label, const int N,
+                                               const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    for (int j = 0; j < D; ++j) {
+      int idx = i * D + j;
+      dX[idx] = -label[idx] * dY[i] / X[idx];
+    }
+  }
+}
+
+template <typename T>
+class CrossEntropyOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use GPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
-    const T* Xdata = X->data<T>();
-    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
-    auto Y = ctx.Output<Tensor>("Y");
-    Y->mutable_data<T>(ctx.GetPlace());
-    T* Ydata = Y->data<T>();
-    int N = X->dims()[0];
-    int D = X->dims()[1];
+    auto x = ctx.Input<Tensor>("X");
+    auto y = ctx.Output<Tensor>("Y");
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* x_data = x->data<T>();
+    y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
+
+    int n = x->dims()[0];
+    int d = x->dims()[1];
     int block = 512;
-    int grid = (N + block - 1) / block;
+    int grid = (n + block - 1) / block;
     // TODO(qingqing) launch kernel on specified stream
     // base on ExecutionContext.
-    CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
+    int label_rank = label->dims().size();
+    if (label_rank == 2) {
+      // soft cross entropy
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
+                                                 d);
+    } else {
+      // normal cross entropy
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
+    }
   }
 };
 
 template <typename T>
-class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use GPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("label");
-    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
-    auto* dYdata = dY->template data<T>();
-    auto* Xdata = X->template data<T>();
-    auto* label_data = label->data<int>();
-    int N = X->dims()[0];
-    int D = X->dims()[1];
+    auto x = ctx.Input<Tensor>("X");
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* dy_data = dy->data<T>();
+    auto* x_data = x->data<T>();
+
+    int n = x->dims()[0];
+    int d = x->dims()[1];
     int block = 512;
-    int grid = (N * D + block - 1) / block;
-    zero<T><<<grid, block>>>(dXdata, N * D);
-    grid = (N + block - 1) / block;
+    int grid = (n * d + block - 1) / block;
+    zero<T><<<grid, block>>>(dx_data, n * d);
+    grid = (n + block - 1) / block;
     // TODO(qingqing): launch kernel on specified stream
     // base on ExecutionContext.
-    CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
-                                                   label_data, N, D);
+    int label_rank = label->dims().size();
+    if (label_rank == 2) {
+      // soft cross entropy
+      auto* label_data = label->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
+          dx_data, dy_data, x_data, label_data, n, d);
+    } else {
+      // normal cross entropy
+      auto* label_data = label->data<int>();
+      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
                                                      label_data, n, d);
+    }
   }
 };
 
@@ -127,7 +171,6 @@ class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
-                       ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpCUDAKernel<float>);
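
The gradient kernels above compute dLoss/dX for the two label modes. A rough numpy restatement of the same formulas (an illustrative sketch with assumed names, not the operator's interface):

import numpy

def cross_entropy_grad(x, label, dy):
    # dy: (batch_size,) incoming gradient of Y
    dx = numpy.zeros_like(x)
    if label.ndim == 2:
        # soft labels: dX[i, j] = -Label[i, j] * dY[i] / X[i, j]
        dx = -label * dy[:, None] / x
    else:
        # index labels: only the labelled column receives a gradient,
        # dX[i, Label[i]] = -dY[i] / X[i, Label[i]]
        rows = numpy.arange(x.shape[0])
        dx[rows, label] = -dy / x[rows, label]
    return dx
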
@@ -40,56 +40,86 @@ inline T tolerable_value(const T x) {
 }
 
 template <typename T>
-class OnehotCrossEntropyOpKernel : public framework::OpKernel {
+class CrossEntropyOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
-    const T* Xdata = X->data<T>();
-    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
-    auto Y = ctx.Output<Tensor>("Y");
-    Y->mutable_data<T>(ctx.GetPlace());
-    T* Ydata = Y->data<T>();
-    int batch_size = X->dims()[0];
-    int class_num = X->dims()[1];
-    for (int i = 0; i < batch_size; ++i) {
-      int index = i * class_num + label_data[i];
-      Ydata[i] = -tolerable_value(std::log(Xdata[index]));
-    }
+    auto x = ctx.Input<Tensor>("X");
+    auto y = ctx.Output<Tensor>("Y");
+
+    auto* x_data = x->data<T>();
+    y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
+
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
+    int label_rank = ctx.Input<Tensor>("Label")->dims().size();
+
+    if (label_rank == 2) {
+      // soft cross entropy
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      int index = 0;
+      for (int i = 0; i < batch_size; ++i) {
+        T sum = static_cast<T>(0);
+        for (int j = 0; j < class_num; ++j) {
+          sum += label_data[index] * std::log(x_data[index]);
+          y_data[i] = -tolerable_value(sum);
+          index++;
+        }
+      }
+    } else {
+      // normal cross entropy
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        y_data[i] = -tolerable_value(std::log(x_data[index]));
+      }
+    }
   }
 };
 
 template <typename T>
-class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
+class CrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("label");
-    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
-    auto* dYdata = dY->template data<T>();
-    auto* Xdata = X->template data<T>();
-    auto* label_data = label->data<int>();
-    const int batch_size = X->dims()[0];
-    const int class_num = X->dims()[1];
-    // TODO(qingqing): make zero setting a common function.
-    memset(dXdata, 0, sizeof(T) * batch_size * class_num);
-    for (int i = 0; i < batch_size; ++i) {
-      int index = i * class_num + label_data[i];
-      dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
-    }
+    auto x = ctx.Input<Tensor>("X");
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* dy_data = dy->data<T>();
+    auto* x_data = x->data<T>();
+
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
+    int label_rank = ctx.Input<Tensor>("Label")->dims().size();
+
+    // TODO(qingqing): make zero setting a common function.
+    if (label_rank == 2) {
+      // soft cross entropy
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      int index = 0;
+      for (int i = 0; i < batch_size; ++i) {
+        for (int j = 0; j < class_num; ++j) {
+          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
+          index++;
+        }
+      }
+    } else {
+      // normal cross entropy
+      auto* label_data = label->data<int>();
+      memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+      for (int i = 0; i < batch_size; ++i) {
+        PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num);
+        int index = i * class_num + label_data[i];
+        dx_data[index] = -dy_data[i] / x_data[index];
+      }
+    }
   }
 };
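
Both the CPU and GPU code paths pass log values through tolerable_value, which only replaces infinities (for example from log(0)) with a large finite constant. A plain-Python equivalent, assuming the same 1e20 cap used above:

def tolerable_value(x, cap=1e20):
    # Clamp +inf/-inf to a large finite value; leave ordinary values untouched.
    if x == float("inf"):
        return cap
    if x == float("-inf"):
        return -cap
    return x
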
@@ -32,7 +32,7 @@ limitations under the License. */
 namespace py = pybind11;
 
 USE_OP(add);
-USE_OP(onehot_cross_entropy);
+USE_OP(cross_entropy);
 USE_OP(sgd);
 USE_OP(mul);
 USE_OP(mean);
@@ -5,13 +5,13 @@ from op_test import OpTest
 
 class TestCrossEntropy(OpTest):
     def setUp(self):
-        self.op_type = "onehot_cross_entropy"
+        self.op_type = "cross_entropy"
         batch_size = 30
         class_num = 10
         X = numpy.random.uniform(0.1, 1.0,
                                  [batch_size, class_num]).astype("float32")
         label = (class_num / 2) * numpy.ones(batch_size).astype("int32")
-        self.inputs = {'X': X, 'label': label}
+        self.inputs = {'X': X, 'Label': label}
         Y = []
         for i in range(0, batch_size):
             Y.append(-numpy.log(X[i][label[i]]))
@@ -24,5 +24,26 @@ class TestCrossEntropy(OpTest):
         self.check_grad(['X'], 'Y')
 
 
+class TestCrossEntropySoftLabel(OpTest):
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        batch_size = 30
+        class_num = 10
+        X = numpy.random.uniform(0.1, 1.0,
+                                 [batch_size, class_num]).astype("float32")
+        label = numpy.random.uniform(0.1, 1.0,
+                                     [batch_size, class_num]).astype("float32")
+        label /= label.sum(axis=1, keepdims=True)
+        self.inputs = {'X': X, 'Label': label}
+        Y = (-label * numpy.log(X)).sum(axis=1)
+        self.outputs = {'Y': numpy.array(Y).astype("float32")}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.05)
+
+
 if __name__ == "__main__":
     unittest.main()
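
The new TestCrossEntropySoftLabel case relies on check_grad, which compares the operator's analytic gradient with a finite-difference estimate. The idea behind that comparison, sketched with plain numpy for the soft-label branch (illustrative only, not the OpTest machinery):

import numpy

def soft_ce(x, label):
    return (-label * numpy.log(x)).sum(axis=1)

x = numpy.random.uniform(0.1, 1.0, [4, 3])
label = numpy.random.uniform(0.1, 1.0, [4, 3])
label /= label.sum(axis=1, keepdims=True)

analytic = -label / x  # dY[i]/dX[i, j] with an incoming gradient of 1
eps = 1e-6
numeric = numpy.zeros_like(x)
for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        x_pos, x_neg = x.copy(), x.copy()
        x_pos[i, j] += eps
        x_neg[i, j] -= eps
        numeric[i, j] = (soft_ce(x_pos, label)[i] -
                         soft_ce(x_neg, label)[i]) / (2 * eps)

assert numpy.allclose(analytic, numeric, rtol=0.05)
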
@@ -128,7 +128,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
 def cross_entropy_layer(net, input, label):
     cost_name = "cross_entropy_%d" % uniq_id()
     cross_entropy_op = Operator(
-        "onehot_cross_entropy", X=input, label=label, Y=cost_name)
+        "cross_entropy", X=input, label=label, Y=cost_name)
     net.append_op(cross_entropy_op)
     scope.new_var(cost_name)
     net.infer_shape(scope)