From 3438d650edee11f3488994370a95ab11696d28d1 Mon Sep 17 00:00:00 2001 From: xuwei06 Date: Mon, 19 Jun 2017 23:41:49 -0700 Subject: [PATCH] Fix bugs for rnn generation 1. v2.layer.parse_network does not correctly handle the generation output. 2. GatherAgentLayer does not correctly handle generation output when batch_size > 1 3. Fix CustomStackTrace for rnn group --- .../gradientmachines/NeuralNetwork.cpp | 9 +-- .../RecurrentGradientMachine.cpp | 16 +++-- .../RecurrentGradientMachine.h | 1 + paddle/gserver/layers/AgentLayer.cpp | 69 +++++++++---------- .../tests/sample_trainer_nest_rnn_gen.conf | 8 +-- .../trainer/tests/sample_trainer_rnn_gen.conf | 6 +- paddle/utils/CustomStackTrace.h | 6 +- paddle/utils/tests/test_CustomStackTrace.cpp | 1 - python/paddle/v2/layer.py | 16 ++++- 9 files changed, 76 insertions(+), 56 deletions(-) diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 4512aacc81f..a361d7deace 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector& inArgs, dataLayers_[i]->setData(inArgs[i]); } + gLayerStackTrace.set_stage(true); + { for (auto& layer : layers_) { REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); gLayerStackTrace.push(layer->getName()); layer->forward(passType); + gLayerStackTrace.pop(layer->getName()); } } @@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector& inArgs, for (auto& layer : outputLayers_) { outArgs->push_back(layer->getOutput()); } - if (passType == PASS_TEST) { - gLayerStackTrace.clear(); - } } void NeuralNetwork::resetState() { @@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) { } void NeuralNetwork::backward(const UpdateCallback& callback) { - gLayerStackTrace.pop(""); // tell layer trace is during backward. 
+ gLayerStackTrace.set_stage(false); FOR_EACH_R(layer, layers_) { REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); + gLayerStackTrace.push((*layer)->getName()); if ((*layer)->needGradient()) { (*layer)->backward(callback); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 3e930380226..867c99ede3f 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -208,6 +208,7 @@ void RecurrentGradientMachine::init( }); CHECK(subModelConfig != config.sub_models().end()); reversed_ = subModelConfig->reversed(); + generating_ = subModelConfig->has_generator(); inFrameLines_.resize(subModelConfig->in_links_size()); for (size_t i = 0; i < inFrameLines_.size(); ++i) { @@ -538,7 +539,7 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, The outputs are outFramesLines_[i].agentLayer */ - if (inFrameLines_.empty() && passType == PASS_TEST) { + if (generating_) { generateSequence(); return; } // else forward.. 
@@ -569,6 +570,9 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, } void RecurrentGradientMachine::backward(const UpdateCallback& callback) { + if (generating_) { + return; + } REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); AsyncGpuBlock asyncGpuBlock; for (int i = maxSequenceLength_ - 1; i >= 0; --i) { @@ -1321,11 +1325,10 @@ void RecurrentGradientMachine::fillGenOutputs() { batchMachineIdVec_.clear(); generator_.ids.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); - int* starts = - generator_.outArg.sequenceStartPositions->getMutableData(false); - starts[0] = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; @@ -1348,7 +1351,10 @@ void RecurrentGradientMachine::fillGenOutputs() { } else { for (size_t i = 0; i < finalPaths_.size(); ++i) { CHECK(!finalPaths_[i].empty()); - generator_.ids = finalPaths_[i][0].ids; + generator_.ids.insert(generator_.ids.begin(), + finalPaths_[i][0].ids.begin(), + finalPaths_[i][0].ids.end()); + starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size(); } } } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 8d94d7e2df2..8e30883ac72 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -414,6 +414,7 @@ protected: std::vector ids; // store generated sequences Argument outArg; // final output argument }; + bool generating_; Generator generator_; std::vector> frames_; diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 31463823b3f..512932d9a55 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -109,6 +109,40 @@ void 
GatherAgentLayer::forwardValue(PassType passType) { } } +namespace { + +// dest[index[i]] <- src[i] for each i +void copyElements(const IVector& srcVec, + const IVector& indexVec, + IVector& destVec) { + const int* src = srcVec.getData(); + const int* index = indexVec.getData(); + int* dest = destVec.getData(); + int len = indexVec.getSize(); + CHECK_EQ(srcVec.getSize(), indexVec.getSize()); + for (int i = 0; i < len; ++i) { + dest[index[i]] = src[i]; + } +} +} + +void GatherAgentLayer::forwardIds(PassType passType) { + IVectorPtr realId = realLayers_[0]->getOutputLabel(); + if (!realId) return; + + IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_); + IVectorPtr outId = output_.ids; + idsVec_.resize(idIndex_.size()); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const IVectorPtr& realId = realLayers_[i]->getOutputLabel(); + idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], + /* size */ realId->getSize(), + useGpu_); + execViaCpu(&copyElements, *realId, *idsVec_[i], *outId); + } +} + void GatherAgentLayer::backward(const UpdateCallback& callback) { (void)callback; const MatrixPtr& outputGrad = getOutputGrad(); @@ -174,41 +208,6 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer); -void GatherAgentLayer::forwardIds(PassType passType) { - int height = 0; - IVectorPtr idReal = realLayers_[0]->getOutputLabel(); - - if (!idReal) return; - - if (output_.subSequenceStartPositions) { - int* starts = output_.subSequenceStartPositions->getMutableData(false); - // Gather generator.idsVec - // if is beam search generation result. Get first result. 
- if (idReal->getData()[idReal->getSize() - 1] == -1) { - for (size_t i = 0; i < realLayers_.size(); ++i) { - // The first element stores first result size - idReal = realLayers_[i]->getOutputLabel(); - idReal->subVecFrom(*idReal, 1, idReal->getData()[0]); - } - } - for (size_t i = 0; i < realLayers_.size(); ++i) { - CHECK(realLayers_[i]->getOutputLabel()); - starts[i] = height; - height += realLayers_[i]->getOutputLabel()->getSize(); - } - starts[realLayers_.size()] = height; - output_.sequenceStartPositions->getMutableData(false)[1] = height; - - IVector::resizeOrCreate(output_.ids, height, false); - for (size_t i = 0; i < realLayers_.size(); ++i) { - output_.ids->subVec(starts[i], starts[i + 1] - starts[i]) - ->copyFrom(*realLayers_[i]->getOutputLabel()); - } - } else { - LOG(FATAL) << "Not implemented"; - } -} - void ScatterAgentLayer::forwardSequence(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf index d669fbc40cb..741a0aa71df 100644 --- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf @@ -35,7 +35,7 @@ def outer_step(dummy_data): embedding_size=num_words)] def inner_step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -46,15 +46,15 @@ def outer_step(dummy_data): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", step=inner_step, input=gen_inputs, bos_id=0, eos_id=num_words-1, beam_size=2 if beam_flag else 1, - num_results_per_sample=2 if beam_flag else 1, - max_length=10) + num_results_per_sample=1, + max_length=10) return beam_gen beam_gen_concat = recurrent_group(name="rnn_gen_concat", diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf 
b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 2b337282f62..58d27f15ae1 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2), embedding_size=num_words)] def step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -44,7 +44,7 @@ def step(dummy_memory, predict_word): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", step=step, input=gen_inputs, @@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen", eos_id=num_words-1, beam_size=2 if beam_flag else 1, num_results_per_sample=2 if beam_flag else 1, - max_length=10) + max_length=10) seqtext_printer_evaluator(input=beam_gen, id_input=sent_id, diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index 6992e856223..52a6df94979 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -55,13 +55,17 @@ public: * Else, just set status to popping. */ void pop(const T& item) { - pushing() = false; auto& s = this->stack(); if (item == s.top()) { s.pop(); } } + /** + * @brief Indicate whether we are at forward or backward stage of computation + */ + void set_stage(bool isForward) { pushing() = isForward; } + /** * @brief clear current thread stack. 
*/ diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index b5d9f93f137..c320074fbad 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) { for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + paddle::str::to_string(i)); } - tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index bbb9c3ea8c1..4ade1c6f329 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network'] def __need_to_keep__(name): return name in [ 'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType', - 'layer_support' + 'layer_support', 'BaseGeneratedInput' ] def __need_to_wrap__(name): - return name not in ['AggregateLevel', 'ExpandLevel'] + return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput'] def __convert_name__(inname): @@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names): return submodel_names +def __get_submodel_data_out_links__(): + data_links = set() + for submodel in cp.g_config.model_config.sub_models: + for link in submodel.out_links: + if cp.g_layer_map[link.link_name].type == 'data': + data_links.add(link.link_name) + return data_links + + def __get_used_evaluators__(layer_names): evaluator_names = set() for e in cp.g_config.model_config.evaluators: @@ -264,6 +273,7 @@ def parse_network(output_layers, extra_layers=None): submodel_names = __get_used_submodels__(layer_names) submodel_names.add('root') evaluator_names = __get_used_evaluators__(layer_names) + data_out_links = __get_submodel_data_out_links__() input_layer_names = set() output_layer_names = set() @@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None): continue model_config.layers.extend([l]) if 
l.type == 'data': - if l.name in model_config.output_layer_names: + if l.name in data_out_links: """ In text generation, the outlink to save the generated word indices is a data_layer defined in recurrent_group. This -- GitLab