Commit f0716491 authored by Yancey1989

fix backward

Parent 1399e5a3
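For context on what the operator computes: the forward kernel in this diff builds, for every sample, a pre-activation along the binary-code path of its label (bit_code.Add for the bias, bit_code.Mul against the per-sample weight slice W[i]), clips it to (-40, 40), and combines a soft-relu term (threshold 40) with a negated code-weighted sum. Below is a minimal NumPy sketch of that per-sample loss; it is not the op itself, and path_index / path_code are hypothetical arrays assumed to be already derived from the label's bit code (the kernel derives them via MatrixBitCodeFunctor).

import numpy as np

def softrelu(x):
    # soft-relu as used by the kernel; inputs are clipped to (-40, 40) beforehand
    return np.log1p(np.exp(x))

def hsigmoid_loss(x_i, w_i, bias, path_index, path_code):
    """Sketch of the hierarchical-sigmoid loss for one sample.

    x_i        : [D]                 input row
    w_i        : [num_classes-1, D]  this sample's weight slice, W[i] in the op
    bias       : [num_classes-1]     shared bias (optional "Bias" input)
    path_index : int array, internal nodes on the label's code path
    path_code  : 0/1 array, the label's code bits at those nodes
    """
    pre = w_i[path_index] @ x_i + bias[path_index]           # bit_code.Mul / bit_code.Add
    pre = np.clip(pre, -40.0, 40.0)                          # ClipFunctor(-40, 40)
    return np.sum(softrelu(pre)) - np.sum(path_code * pre)   # softrelu term + Sum(..., -1)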
@@ -61,10 +61,8 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Parameters"),
"Input(Parameters)"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1});
@@ -84,15 +82,17 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Parameters"),
"Input(Parameters)"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input(Label)"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Parameters")),
"Input(Parameters@Grad should not be null.)");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
"Input(W@Grad should not be null.)");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
@@ -112,11 +112,11 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor, required) The input Tensor, which the shape is"
"[N * D], which N is the size of mini-batch,"
"D is the embded size");
AddInput("Parameters",
AddInput("W",
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is s a 3-D tensor, the shape is"
"[N, num_classes - 1, D]");
AddInput("Label",
AddInput("Ids",
"(Tensor, required), The labels of training data. It's a"
"1-D tensor, which the shape is [1, N]");
AddInput("Bias",
......
@@ -32,15 +32,14 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* params = ctx.Input<framework::Tensor>("Parameters");
auto* label = ctx.Input<framework::Tensor>("Label");
auto* w = ctx.Input<framework::Tensor>("W");
auto* ids = ctx.Input<framework::Tensor>("Ids");
auto* bias = ctx.Input<framework::Tensor>("Bias");
auto* out = ctx.Output<framework::Tensor>("Out");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
int64_t code_length = math::FindLastSet(num_classes - 1);
int64_t batch_size = in->dims()[0];
auto* ids = label->data<int64_t>();
framework::Tensor pre_out;
framework::Tensor sum;
auto pre_out_data = pre_out.mutable_data<T>(
@@ -59,18 +58,19 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
auto out_mat = framework::EigenVector<T>::Flatten(*out);
if (bias) {
bit_code.Add(num_classes, ids, pre_out, *bias);
bit_code.Add(num_classes, ids->data<int64_t>(), pre_out, *bias);
}
for (int i = 0; i < in->dims()[0]; ++i) {
bit_code.Mul(num_classes, ids, pre_out, params->Slice(i, i + 1),
in->Slice(i, i + 1));
bit_code.Mul(num_classes, ids->data<int64_t>(), pre_out,
w->Slice(i, i + 1), in->Slice(i, i + 1));
}
// clip the matrix with (-40, 40)
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out.numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
bit_code.Sum(num_classes, ids, pre_out, *out, static_cast<T>(-1));
bit_code.Sum(num_classes, ids->data<int64_t>(), pre_out, *out,
static_cast<T>(-1));
// softrelu with threshold is 40.0
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out.numel(), pre_out_data,
@@ -88,10 +88,9 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* params =
ctx.Output<framework::Tensor>(framework::GradVarName("Parameters"));
auto* w = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
auto* bias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
auto* label = ctx.Input<framework::Tensor>("Label");
auto* ids = ctx.Input<framework::Tensor>("Ids");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
int64_t code_length = math::FindLastSet(num_classes - 1);
int64_t batch_size = in->dims()[0];
@@ -102,8 +101,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto& device_ctx = ctx.template device_context<DeviceContext>();
auto pre_out_mat = EigenMatrix<T>::From(pre_out);
auto* ids = label->data<int64_t>();
// init pre_out matrix with {1.0}
math::SetConstant<DeviceContext, T> one;
math::MatrixBitCodeFunctor<T> bit_code;
@@ -112,19 +109,22 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
pre_out_mat.device(place) =
pre_out_mat * (static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat);
bit_code.Sub(num_classes, ids, pre_out);
bit_code.Sub(num_classes, ids->data<int64_t>(), pre_out);
if (bias) {
bit_code.AddGrad(num_classes, ids, pre_out, *bias);
bias->mutable_data<T>(ctx.GetPlace());
bit_code.AddGrad(num_classes, ids->data<int64_t>(), pre_out, *bias);
}
in_grad->mutable_data<T>(ctx.GetPlace());
w->mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < in_grad->dims()[0]; ++i) {
auto p_sliced = params->Slice(i, i + 1);
auto p_sliced = w->Slice(i, i + 1);
auto in_sliced = in->Slice(i, i + 1);
auto in_grad_sliced = in_grad->Slice(i, i + 1);
bit_code.MulGradWeight(num_classes, ids, pre_out, p_sliced, in_sliced);
bit_code.MulGradError(num_classes, ids, pre_out, p_sliced,
in_grad_sliced);
bit_code.MulGradWeight(num_classes, ids->data<int64_t>(), pre_out,
p_sliced, in_sliced);
bit_code.MulGradError(num_classes, ids->data<int64_t>(), pre_out,
p_sliced, in_grad_sliced);
}
}
};
......
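Read against the sketch above, the bit-code helpers used in the backward kernel correspond to the textbook gradients of that loss. This is a hedged sketch of the math only; the kernel's own intermediate transform of pre_out is not reproduced, and path_index / path_code are the same hypothetical arrays as before.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def hsigmoid_grad(x_i, w_i, bias, path_index, path_code):
    # Gradients of the per-sample loss w.r.t. X, W[i] and Bias.
    pre = np.clip(w_i[path_index] @ x_i + bias[path_index], -40.0, 40.0)
    dpre = sigmoid(pre) - path_code                   # bit_code.Sub subtracts the code bits
    dbias = np.zeros_like(bias)
    np.add.at(dbias, path_index, dpre)                # AddGrad: scatter into Bias@GRAD
    dw_i = np.zeros_like(w_i)
    np.add.at(dw_i, path_index, np.outer(dpre, x_i))  # MulGradWeight: outer product per node
    dx_i = w_i[path_index].T @ dpre                   # MulGradError: back-propagate through W[i]
    return dx_i, dw_i, dbias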
@@ -56,7 +56,6 @@ static void AddByBitCodeT(Op op, CodeTable code_table, const int64_t* codes,
const framework::Tensor& vec) {
size_t num_sample = tmat.dims()[0];
size_t width = vec.dims()[1];
for (size_t i = 0; i < num_sample; ++i) {
auto code = code_table(static_cast<size_t>(codes[i]));
int code_length = code.get_length();
......
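Both kernels size pre_out with code_length = math::FindLastSet(num_classes - 1), which returns the position of the highest set bit. A quick Python equivalent (int.bit_length), included only to make the buffer width concrete; the value of num_classes here is illustrative.

def find_last_set(x):
    # 1-based index of the most significant set bit (0 for x == 0)
    return x.bit_length()

num_classes = 6
code_length = find_last_set(num_classes - 1)  # find_last_set(5) == 3
# the kernels above allocate pre_out using batch_size and code_length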
@@ -109,8 +109,6 @@ PYBIND11_PLUGIN(core) {
.def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>)
.def("get_float_element", TensorGetElement<float>)
.def("set_int64_element", TensorSetElement<int64_t>)
.def("get_int64_element", TensorGetElement<int64_t>)
.def("set_double_element", TensorSetElement<double>)
.def("get_double_element", TensorGetElement<double>)
.def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
......
@@ -148,7 +148,6 @@ class Executor(object):
inputs={'X': [var]},
outputs={'Out': [fetch_var]},
attrs={'col': i})
self.executor.run(program.desc, scope, 0, True, True)
outs = [
core.get_fetch_variable(scope, fetch_var_name, i)
......
@@ -123,8 +123,6 @@ def get_numeric_gradient(scope,
def __set_elem__(tensor, i, e):
if tensor_to_check_dtype == np.float32:
tensor.set_float_element(i, e)
elif tensor_to_check_dtype == np.int64:
tensor.set_int64_element(i, e)
else:
tensor.set_double_element(i, e)
......
@@ -10,16 +10,11 @@ class TestHSigmoidOp(OpTest):
embded_size = 10
batch_size = 5
x = np.random.random((batch_size, embded_size)).astype("float32")
parameter = np.random.random(
w = np.random.random(
(batch_size, num_classes - 1, embded_size)).astype("float32")
label = np.random.randint(0, num_classes, batch_size)
ids = np.random.randint(0, num_classes, batch_size)
bias = np.random.random((1, num_classes - 1)).astype("float32")
self.inputs = {
'X': x,
'Parameters': parameter,
'Label': label,
'Bias': bias
}
self.inputs = {'X': x, 'W': w, 'Ids': ids, 'Bias': bias}
self.attrs = {'num_classes': num_classes}
self.outputs = {
'Out': np.random.random((batch_size, 1)).astype("float32")
@@ -29,10 +24,7 @@ class TestHSigmoidOp(OpTest):
self.check_output()
def test_check_grad(self):
self.check_grad(
['X', 'Parameters', 'Label', 'Bias'],
'Out',
no_grad_set=set(['Label']))
self.check_grad(['X', 'W', 'Bias'], 'Out', no_grad_set=set('Ids'))
if __name__ == '__main__':
......