提交 02d68051 编写于 作者: J JiabinYang

add sparsed bias grad, test=develop

上级 42470f14
......@@ -107,8 +107,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
"it should have shape like [N, L], L is the length of the Path")
.AsDispensable();
AddInput("Bias",
"(LoDTensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1].");
"(LoDTensor, optional), The bias is a tensor with shape or "
"[non_leaf_num, 1]"
"[num_classes - 1, 1].");
AddOutput(
"Out",
"(LoDTensor, required) The output of hierarchical sigmoid operator."
......@@ -148,11 +149,11 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
"Output(W@Grad should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad should not be null.");
if (!ctx->Attrs().Get<bool>("is_sparse")) {
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
if (!ctx->Attrs().Get<bool>("is_sparse")) {
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
}
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
......@@ -172,20 +173,31 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto out_W_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto out_Bias_var_name =
op_desc.Output(framework::GradVarName("Bias")).front();
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows";
block->Var(out_var_name)
block->Var(out_W_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
VLOG(3) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to SelectedRows";
block->Var(out_Bias_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
block->Var(out_W_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
VLOG(3) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to SelectedRows";
block->Var(out_Bias_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
block->Var(out_W_var_name)->SetDataType(block->Var("W")->GetDataType());
}
};
......
......@@ -124,13 +124,12 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto* w = ctx.Input<framework::LoDTensor>("W");
auto* path = ctx.Input<framework::LoDTensor>("PTable");
auto* code = ctx.Input<framework::LoDTensor>("PCode");
auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* in_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
bool is_sparse = ctx.Attr<bool>("is_sparse");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> zero;
auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
auto* label = ctx.Input<framework::LoDTensor>("Label");
auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
auto* out_grad =
......@@ -174,12 +173,15 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
pre_out_grad_mat * out_grad_mat.broadcast(bcast);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if (!is_sparse) {
auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
if (!is_sparse) {
auto* w_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
w_grad->mutable_data<T>(ctx.GetPlace());
......@@ -199,6 +201,21 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
auto* bias_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->set_rows(real_rows);
// build ids -> rows index map
bias_grad->SyncIndex();
bias_grad->set_height(bias->dims()[0]);
auto* bias_grad_value = bias_grad->mutable_value();
std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
bias->dims()[1]};
bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
ctx.GetPlace());
zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
}
bit_code->MulGradError(pre_out_grad, *w, in_grad);
......
......@@ -48,6 +48,24 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
framework::SelectedRows* vec) {
size_t batch_size = tmat.dims()[0];
size_t width = tmat.dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table->get_code(i);
int code_length = code->get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j);
int64_t row_index =
vec->AutoGrownIndex(static_cast<int64_t>(index), false, true);
vec->mutable_value()->data<T>()[row_index] +=
tmat.data<T>()[i * width + j];
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Sum(const framework::LoDTensor& tmat,
framework::LoDTensor* sum, T scale_sum) {
......
......@@ -241,6 +241,11 @@ class MatrixBitCodeFunctor {
*/
void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/
......
......@@ -4639,14 +4639,14 @@ def hsigmoid(input,
if not is_costum:
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=[1, num_classes - 1],
shape=[num_classes - 1, 1],
is_bias=True,
dtype=input.dtype)
inputs['Bias'] = bias
else:
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=[1, non_leaf_num],
shape=[non_leaf_num, 1],
is_bias=True,
dtype=input.dtype)
inputs['Bias'] = bias
......
......@@ -77,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length = code_table.get_length()
for j in range(length):
idx = code_table.cal_index(j)
pre_output[i][j] += bias[0][idx]
pre_output[i][j] += bias[idx][0]
for i in range(batch_size):
code_table = CodeTable(num_classes, label[i])
length = code_table.get_length()
......@@ -115,7 +115,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
length = code_table.get_length()
for j in range(length):
idx = code_table.cal_index(j)
pre_output[i][j] += bias[0][idx]
pre_output[i][j] += bias[idx][0]
for i in range(batch_size):
code_table = CodeTableWithCustomTree(ptable, pcode, i)
length = code_table.get_length()
......@@ -150,7 +150,7 @@ class TestHSigmoidOp(OpTest):
w = np.random.random(
(num_classes - 1, feature_size)).astype("float32") * 2
label = np.random.randint(0, num_classes, (batch_size, 1))
bias = np.random.random((1, num_classes - 1)).astype("float32")
bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': False}
self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
pre_output, out = hsigmoid(x, w, label, bias, num_classes)
......@@ -178,7 +178,7 @@ class TestHSigmoidOpSparse(OpTest):
-1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
bias = np.random.random((1, num_classes - 1)).astype("float32")
bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': True}
self.inputs = {
'X': x,
......@@ -193,7 +193,6 @@ class TestHSigmoidOpSparse(OpTest):
self.outputs = {'PreOut': pre_output, 'Out': out}
def test_check_output(self):
print("checking output in CostumTree")
self.check_output()
......@@ -208,7 +207,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
emb = fluid.layers.embedding(
input=input_word,
is_sparse=False,
is_sparse=is_sparse,
size=[3, 3],
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(3))))
......@@ -220,6 +219,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
ptable=ptable,
pcode=pcode,
is_costum=True,
bias_attr=True,
is_sparse=is_sparse)
avg_cost = fluid.layers.reduce_mean(cost)
......@@ -240,7 +240,6 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
optimizer.minimize(loss)
main_program = fluid.default_main_program()
# print("main program: {program}".format{program=str(main_program)})
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
exe = fluid.Executor(place)
......@@ -279,7 +278,7 @@ class TestHSigmoidOpWithCostumTree(OpTest):
-1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
bias = np.random.random((1, num_classes - 1)).astype("float32")
bias = np.random.random((num_classes - 1, 1)).astype("float32")
self.attrs = {'num_classes': num_classes, 'is_sparse': False}
self.inputs = {
'X': x,
......@@ -294,11 +293,9 @@ class TestHSigmoidOpWithCostumTree(OpTest):
self.outputs = {'PreOut': pre_output, 'Out': out}
def test_check_output(self):
print("checking output in CostumTree")
self.check_output()
def test_check_grad(self):
print("checking outputGrad in CostumTree")
self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册