diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md index 9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee..c4a9bbeeefca0e05c335dd60233691e8bac33015 100644 --- a/doc/design/ops/sequence_decoder.md +++ b/doc/design/ops/sequence_decoder.md @@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, let's call this format the **absolute-offset LoD** for clarity. -The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows ```python [[0, 3, 9] [0, 2, 3, 3, 3, 9]] @@ -119,7 +119,7 @@ def generate(): encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) decoder_input = pd.fc( act=pd.activation.Linear(), - input=[target_word, encoder_ctx], + input=[target_word, encoder_ctx_expanded], size=3 * decoder_dim) gru_out, cur_mem = pd.gru_step( decoder_input, mem=decoder_mem, size=decoder_dim) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 1382bfca19a674a404916a5c709276ce41219d2f..bd58c0a7f8161f6b45f2b500f3685e4028d97e96 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -116,8 +116,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << op->DebugStringEx(local_scope); + VLOG(4) << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + VLOG(3) << op->DebugStringEx(local_scope); if (FLAGS_do_memory_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index b29f528f3f749efa3463125c774c2f4d4ebcbc7c..53b0d0fe083579da4f0bb600f292765aa2aa0d8a 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets if (in.empty() || in.size() == 1) return in; LoD result = in; - for (int level = result.size() - 2; level >= 0; level--) { - for (auto &ele : result[level]) { - ele = result[level + 1][ele]; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; } } return result; diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 4c71d66d22899d2cf6418935bf9358a0f73cec27..844ade40eb2a7ae239b079daa609f03b9e7a06df 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -24,8 +24,18 @@ namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + auto items = SelectTopBeamSizeItems(); - auto selected_items = ToMap(items); + auto selected_items = ToMap(items, high_level.back()); + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset:" << i; + for (auto &item : selected_items[i]) { + VLOG(3) << ItemToString(item); + } + } PruneEndidCandidates(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( @@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, low_level.push_back(low_offset); // fill lod - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; framework::LoD lod(2); lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } selected_ids->set_lod(lod); selected_scores->set_lod(lod); } @@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, } std::vector> BeamSearch::ToMap( - const std::vector> &items) { + const std::vector> &items, size_t element_num) { std::vector> result; + result.resize(element_num); for (auto &entries : items) { for (const auto &item : entries) { - if (item.offset >= result.size()) { - result.resize(item.offset + 1); - } result[item.offset].push_back(item); } } @@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() { } result.emplace_back(items); } + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << ItemToString(item); + } + } + return result; } @@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector *items) { return true; } +std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { + os << "{"; + os << "offset: " << item.offset << ", "; + os << "id: " << item.id << ", "; + os << "score: " << item.score << ""; + os << "}"; + + return os; +} + +std::string ItemToString(const BeamSearch::Item &item) { + std::ostringstream stream; + stream << item; + return stream.str(); +} + class BeamSearchProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: @@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker } }; +class BeamSearchInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + for (const std::string &arg : + std::vector({"pre_ids", "ids", "scores"})) { + PADDLE_ENFORCE(context->HasInput(arg), + "BeamSearch need input argument '%s'", arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(context->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("selected_ids")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto &o : op_desc.Output("selected_scores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp, - paddle::operators::BeamSearchProtoAndCheckerMaker); +REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, + paddle::operators::BeamSearchProtoAndCheckerMaker, + paddle::operators::BeamSearchInferShape, + paddle::operators::BeamSearchInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h index 45d14d68fe8d1c4a84aa826e68e76692444765a8..7ad85874fcbd6ea48d688b32f2cc982d6b76d3c4 100644 --- a/paddle/operators/beam_search_op.h +++ b/paddle/operators/beam_search_op.h @@ -136,8 +136,6 @@ class BeamSearch { void operator()(const framework::LoDTensor& pre_ids, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); - - protected: /* * The basic items help to sort. */ @@ -155,6 +153,7 @@ class BeamSearch { score_t score; }; + protected: /* * Delete all the records that follows the end token. */ @@ -166,7 +165,7 @@ class BeamSearch { * NOTE low performance */ std::vector> ToMap( - const std::vector>& inputs); + const std::vector>& inputs, size_t element_num); /* * For each source, select top beam_size records. @@ -187,6 +186,10 @@ class BeamSearch { int end_id_{0}; }; +std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); + +std::string ItemToString(const BeamSearch::Item& item); + class BeamSearchOp : public framework::OperatorBase { public: BeamSearchOp(const std::string& type, @@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase { void Run(const framework::Scope& scope, const platform::Place& dev_place) const override { - LOG(INFO) << "run beam search op"; auto ids_var = scope.FindVar(Input("ids")); auto scores_var = scope.FindVar(Input("scores")); auto pre_ids_var = scope.FindVar(Input("pre_ids")); @@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase { size_t level = Attr("level"); size_t beam_size = Attr("beam_size"); int end_id = Attr("end_id"); - LOG(INFO) << "init beam search"; BeamSearch alg(ids, scores, level, beam_size, end_id); - LOG(INFO) << "after beam search"; auto selected_ids_var = scope.FindVar(Output("selected_ids")); auto selected_scores_var = scope.FindVar(Output("selected_scores")); PADDLE_ENFORCE_NOT_NULL(selected_ids_var); @@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase { *selected_ids_var->GetMutable(); auto& selected_scores_tensor = *selected_scores_var->GetMutable(); - LOG(INFO) << "run beam search"; alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); - LOG(INFO) << "finish beam search"; } }; diff --git a/paddle/operators/sequence_expand_op.h b/paddle/operators/sequence_expand_op.h index 2ba628e9c37278025e31779ab0468db46f2ff40a..6021526eee8e0a1f58885f6de38b14048787a828 100644 --- a/paddle/operators/sequence_expand_op.h +++ b/paddle/operators/sequence_expand_op.h @@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel { const T* x_data = x->data(); auto x_dims = x->dims(); auto* y = context.Input("Y"); + PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), y->lod().back().size() - 1, "The size of last lod level in Input(Y)" diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff..bf42e15e6b234125d9ec24e8500367b9915213ab 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -22,6 +22,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template @@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { // Get the top k elements of each row of input tensor // FIXME: only deal with matrix(2d tensor). - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); // k is determined by Attr const size_t k = static_cast(ctx.Attr("k")); diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 0b0064ade90d2b70dd1458cb4d20d741fbf1efcd..8c481444e9bf895d29ed4e4952e825c2eaafc915 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -100,7 +100,8 @@ class LayerHelper(object): if dtype is None: dtype = each.dtype elif dtype != each.dtype: - raise ValueError("Data Type mismatch") + raise ValueError("Data Type mismatch: %d to %d" % + (dtype, each.dtype)) return dtype def create_parameter(self, diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 5f01fdb076d3bf7d060a805d1431f4973993a843..ed627dac26d286214f27ea43e887c52c394e841d 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -769,7 +769,7 @@ def topk(input, k): array = fluid.layers.topk(x, k) """ helper = LayerHelper('topk', **locals()) - topk_out = helper.create_tmp_variable(dtype=input.data_type) + topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_indices = helper.create_tmp_variable(dtype='int64') helper.append_op( type='top_k', diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 7b9fb024fb2f7b5c7892354bf69138cc83ed1f2c..efad649078de911295a06f52338bb4357cf228cc 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -61,6 +61,7 @@ __all__ = [ 'transpose', 'im2sequence', 'nce', + 'beam_search', ] @@ -163,10 +164,8 @@ def fc(input, tmp = helper.create_tmp_variable(dtype) helper.append_op( type="mul", - inputs={ - "X": input_var, - "Y": w, - }, + inputs={"X": input_var, + "Y": w}, outputs={"Out": tmp}, attrs={"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1}) @@ -1551,6 +1550,38 @@ def sequence_expand(x, y, name=None): return tmp +def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): + ''' + This function implements the beam search algorithm. + ''' + helper = LayerHelper('beam_search', **locals()) + score_type = scores.dtype + id_type = ids.dtype + + selected_scores = helper.create_tmp_variable(dtype=score_type) + selected_ids = helper.create_tmp_variable(dtype=id_type) + + helper.append_op( + type='beam_search', + inputs={ + 'pre_ids': pre_ids, + 'ids': ids, + 'scores': scores, + }, + outputs={ + 'selected_ids': selected_ids, + 'selected_scores': selected_scores, + }, + attrs={ + # TODO(ChunweiYan) to assure other value support + 'level': level, + 'beam_size': beam_size, + 'end_id': end_id, + }) + + return selected_ids, selected_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 53ae200a2387712c63ab67f44d4e9da03ebbe4b2..82b760d693560dae1ab1fa39afdc186f60423e65 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -17,7 +17,7 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.layers as pd from paddle.v2.fluid.executor import Executor dict_size = 30000 @@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) hidden_dim = 32 word_dim = 16 IS_SPARSE = True -batch_size = 10 -max_length = 50 +batch_size = 2 +max_length = 8 topk_size = 50 trg_dic_size = 10000 +beam_size = 2 decoder_size = hidden_dim +place = core.CPUPlace() -def encoder_decoder(): + +def encoder(): # encoder - src_word_id = layers.data( + src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) - src_embedding = layers.embedding( + src_embedding = pd.embedding( input=src_word_id, size=[dict_size, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_last_step(input=lstm_hidden0) + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + +def decoder_train(context): # decoder - trg_language_word = layers.data( + trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) - trg_embedding = layers.embedding( + trg_embedding = pd.embedding( input=trg_language_word, size=[dict_size, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - rnn = fluid.layers.DynamicRNN() + rnn = pd.DynamicRNN() with rnn.block(): current_word = rnn.step_input(trg_embedding) - mem = rnn.memory(init=encoder_out) - fc1 = fluid.layers.fc(input=[current_word, mem], + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], size=decoder_size, act='tanh') - out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') - rnn.update_memory(mem, fc1) - rnn.output(out) + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) return rnn() +def decoder_decode(context): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64') + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], + size=decoder_size, + act='tanh') + + # use score to do beam search + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=50) + selected_ids, selected_scores = pd.beam_search( + pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + pd.less_than(x=counter, y=array_len, cond=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array) + + # return init_ids, init_scores + + return translation_ids, translation_scores + + +def set_init_lod(data, lod, place): + res = core.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res + + def to_lodtensor(data, place): seq_lens = [len(seq) for seq in data] cur_len = 0 @@ -88,12 +171,13 @@ def to_lodtensor(data, place): return res -def main(): - rnn_out = encoder_decoder() - label = layers.data( +def train_main(): + context = encoder() + rnn_out = decoder_train(context) + label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) - cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = fluid.layers.mean(x=cost) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(x=cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) @@ -103,13 +187,12 @@ def main(): paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) - place = core.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) batch_id = 0 - for pass_id in xrange(2): + for pass_id in xrange(1): for data in train_data(): word_data = to_lodtensor(map(lambda x: x[0], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place) @@ -125,9 +208,48 @@ def main(): print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + " avg_cost=" + str(avg_cost_val)) if batch_id > 3: - exit(0) + break batch_id += 1 +def decode_main(): + context = encoder() + translation_ids, translation_scores = decoder_decode(context) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [i for i in range(batch_size)] + [batch_size] + init_lod = [init_lod, init_lod] + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + for _, data in enumerate(train_data()): + init_ids = set_init_lod(init_ids_data, init_lod, place) + init_scores = set_init_lod(init_scores_data, init_lod, place) + + src_word_data = to_lodtensor(map(lambda x: x[0], data), place) + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed={ + 'src_word_id': src_word_data, + 'init_ids': init_ids, + 'init_scores': init_scores + }, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + break + + if __name__ == '__main__': - main() + # train_main() + decode_main()