Commit 02d68051 authored by JiabinYang

add sparsed bias grad, test=develop

Parent 42470f14
@@ -107,8 +107,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
              "it should have shape like [N, L], L is the length of the Path")
         .AsDispensable();
     AddInput("Bias",
-             "(LoDTensor, optional), The bias is a tensor with shape"
-             "[1, num_classes - 1].");
+             "(LoDTensor, optional), The bias is a tensor with shape or "
+             "[non_leaf_num, 1]"
+             "[num_classes - 1, 1].");
     AddOutput(
         "Out",
         "(LoDTensor, required) The output of hierarchical sigmoid operator."
@@ -148,11 +149,11 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                    "Output(W@Grad should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@Grad should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
     if (!ctx->Attrs().Get<bool>("is_sparse")) {
+      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+        ctx->SetOutputDim(framework::GradVarName("Bias"),
+                          ctx->GetInputDim("Bias"));
+      }
       ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
     }
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -172,20 +173,31 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_W_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_Bias_var_name =
+        op_desc.Output(framework::GradVarName("Bias")).front();
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
+      block->Var(out_W_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to SelectedRows";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to SelectedRows";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    block->Var(out_W_var_name)->SetDataType(block->Var("W")->GetDataType());
   }
 };
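(Editor's note: a rough, hedged numpy analogue of what switching Bias@GRAD to SELECTED_ROWS means for the data layout; this is not Paddle API, just the rows/value/height idea.)

import numpy as np

# Hypothetical dense bias gradient over 5 internal nodes; only nodes 1 and 3
# were visited by the batch, so the other rows stay zero.
dense_bias_grad = np.array([[0.0], [0.2], [0.0], [-0.1], [0.0]], dtype="float32")

# Selected-rows analogue: store just the visited rows plus their ids and the
# full height, instead of materializing every row.
rows = [1, 3]
value = dense_bias_grad[rows]        # shape [len(rows), 1]
height = dense_bias_grad.shape[0]    # 5

# Scattering the compact form back reproduces the dense gradient.
restored = np.zeros((height, 1), dtype="float32")
restored[rows] = value
assert np.allclose(restored, dense_bias_grad)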
......
@@ -124,13 +124,12 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto* w = ctx.Input<framework::LoDTensor>("W");
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PCode");
+    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
     auto* in_grad =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     bool is_sparse = ctx.Attr<bool>("is_sparse");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
     auto* label = ctx.Input<framework::LoDTensor>("Label");
     auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
     auto* out_grad =
@@ -174,12 +173,15 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
         pre_out_grad_mat * out_grad_mat.broadcast(bcast);
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
     if (!is_sparse) {
+      auto* bias_grad =
+          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->mutable_data<T>(ctx.GetPlace());
+        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       auto* w_grad =
           ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
       w_grad->mutable_data<T>(ctx.GetPlace());
@@ -199,6 +201,21 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
       w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
       zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
+      auto* bias_grad =
+          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->set_rows(real_rows);
+        // build ids -> rows index map
+        bias_grad->SyncIndex();
+        bias_grad->set_height(bias->dims()[0]);
+        auto* bias_grad_value = bias_grad->mutable_value();
+        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
+                                     bias->dims()[1]};
+        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
+                                         ctx.GetPlace());
+        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
     }
     bit_code->MulGradError(pre_out_grad, *w, in_grad);
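(Editor's note: a hedged numpy sketch of the Bias@GRAD container the sparse branch above assembles; the toy sizes are assumptions, and the actual accumulation is done by AddGrad, sketched after the matrix_bit_code change below.)

import numpy as np

# Toy setup: Bias holds 5 internal-node rows and this batch only touches
# nodes 0, 1 and 3 (real_rows in the kernel).
bias = np.zeros((5, 1), dtype="float32")
real_rows = [0, 1, 3]

bias_grad_rows = real_rows                           # set_rows(real_rows)
bias_grad_height = bias.shape[0]                     # set_height(bias->dims()[0])
bias_grad_value = np.zeros((len(real_rows), bias.shape[1]),
                           dtype="float32")          # value of shape [len(real_rows), 1]
row_index = {r: i for i, r in enumerate(real_rows)}  # SyncIndex(): ids -> rows map
print(bias_grad_rows, bias_grad_height, bias_grad_value.shape, row_index)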
......
@@ -48,6 +48,24 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
   }
 }
 
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
+                                      framework::SelectedRows* vec) {
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code->calc_index(j);
+      int64_t row_index =
+          vec->AutoGrownIndex(static_cast<int64_t>(index), false, true);
+      vec->mutable_value()->data<T>()[row_index] +=
+          tmat.data<T>()[i * width + j];
+    }
+  }
+}
+
 template <typename T>
 void MatrixBitCodeFunctor<T>::Sum(const framework::LoDTensor& tmat,
                                   framework::LoDTensor* sum, T scale_sum) {
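(Editor's note: a hedged numpy translation of the new overload; the code_table lookups are replaced by an explicit per-sample list of internal-node indices, and the toy data is made up.)

import numpy as np

def add_grad_selected_rows(tmat, value, row_index, codes):
    """Mirror the selected-rows AddGrad: for j < code_length,
    value[row_index[index(i, j)]][0] += tmat[i][j]."""
    batch_size = tmat.shape[0]
    for i in range(batch_size):
        code = codes[i]                      # internal-node indices on sample i's path
        for j in range(len(code)):
            value[row_index[code[j]]][0] += tmat[i][j]
    return value

# Toy data: 2 samples with code length 2; the gradient carries rows {0, 1, 2}.
tmat = np.array([[0.5, -0.25], [1.0, 0.75]], dtype="float32")
codes = [[0, 2], [0, 1]]
rows = [0, 1, 2]
row_index = {r: i for i, r in enumerate(rows)}
value = np.zeros((len(rows), 1), dtype="float32")
print(add_grad_selected_rows(tmat, value, row_index, codes))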
......
@@ -241,6 +241,11 @@ class MatrixBitCodeFunctor {
    */
   void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec);
 
+  /* For selected rows For j < code_length
+    vec(0, index(i, j)) += tmat(i, j)
+  */
+  void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec);
+
   /* For j < code_length
     sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
   */
......
@@ -4639,14 +4639,14 @@ def hsigmoid(input,
         if not is_costum:
             bias = helper.create_parameter(
                 attr=helper.bias_attr,
-                shape=[1, num_classes - 1],
+                shape=[num_classes - 1, 1],
                 is_bias=True,
                 dtype=input.dtype)
             inputs['Bias'] = bias
         else:
             bias = helper.create_parameter(
                 attr=helper.bias_attr,
-                shape=[1, non_leaf_num],
+                shape=[non_leaf_num, 1],
                 is_bias=True,
                 dtype=input.dtype)
             inputs['Bias'] = bias
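(Editor's note: a small hedged check, in plain numpy with made-up sizes, that the column layout created here is just the transpose of the old row layout and is addressed as bias[idx][0], matching the reference code in the tests below.)

import numpy as np

num_classes = 6
old_bias = np.random.rand(1, num_classes - 1).astype("float32")  # old layout [1, num_classes - 1]
new_bias = old_bias.T                                            # new layout [num_classes - 1, 1]

# The same per-node value is now read as new_bias[idx][0] instead of old_bias[0][idx].
for idx in range(num_classes - 1):
    assert new_bias[idx][0] == old_bias[0][idx]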
......
@@ -77,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -115,7 +115,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTableWithCustomTree(ptable, pcode, i)
         length = code_table.get_length()
@@ -150,7 +150,7 @@ class TestHSigmoidOp(OpTest):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
         pre_output, out = hsigmoid(x, w, label, bias, num_classes)
@@ -178,7 +178,7 @@ class TestHSigmoidOpSparse(OpTest):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
             'X': x,
@@ -193,7 +193,6 @@ class TestHSigmoidOpSparse(OpTest):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()
@@ -208,7 +207,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
         emb = fluid.layers.embedding(
             input=input_word,
-            is_sparse=False,
+            is_sparse=is_sparse,
             size=[3, 3],
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(3))))
@@ -220,6 +219,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
             ptable=ptable,
             pcode=pcode,
             is_costum=True,
+            bias_attr=True,
             is_sparse=is_sparse)
         avg_cost = fluid.layers.reduce_mean(cost)
@@ -240,7 +240,6 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
         optimizer.minimize(loss)
         main_program = fluid.default_main_program()
-        # print("main program: {program}".format{program=str(main_program)})
         place = fluid.CPUPlace()
         feeder = fluid.DataFeeder(feed_list=data_list, place=place)
         exe = fluid.Executor(place)
@@ -279,7 +278,7 @@ class TestHSigmoidOpWithCostumTree(OpTest):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
@@ -294,11 +293,9 @@ class TestHSigmoidOpWithCostumTree(OpTest):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()
 
     def test_check_grad(self):
-        print("checking outputGrad in CostumTree")
         self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
......