提交 6fa84d40 编写于 作者: C Cao Ying 提交者: GitHub

Merge pull request #2521 from emailweixu/fix_rnn_gen

Fix bugs for rnn generation.
...@@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs, ...@@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
dataLayers_[i]->setData(inArgs[i]); dataLayers_[i]->setData(inArgs[i]);
} }
gLayerStackTrace.set_stage(true);
{ {
for (auto& layer : layers_) { for (auto& layer : layers_) {
REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
gLayerStackTrace.push(layer->getName()); gLayerStackTrace.push(layer->getName());
layer->forward(passType); layer->forward(passType);
gLayerStackTrace.pop(layer->getName());
} }
} }
...@@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs, ...@@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
for (auto& layer : outputLayers_) { for (auto& layer : outputLayers_) {
outArgs->push_back(layer->getOutput()); outArgs->push_back(layer->getOutput());
} }
if (passType == PASS_TEST) {
gLayerStackTrace.clear();
}
} }
void NeuralNetwork::resetState() { void NeuralNetwork::resetState() {
...@@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) { ...@@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) {
} }
void NeuralNetwork::backward(const UpdateCallback& callback) { void NeuralNetwork::backward(const UpdateCallback& callback) {
gLayerStackTrace.pop(""); // tell layer trace is during backward. gLayerStackTrace.set_stage(false);
FOR_EACH_R(layer, layers_) { FOR_EACH_R(layer, layers_) {
REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
gLayerStackTrace.push((*layer)->getName());
if ((*layer)->needGradient()) { if ((*layer)->needGradient()) {
(*layer)->backward(callback); (*layer)->backward(callback);
} }
......
...@@ -208,6 +208,7 @@ void RecurrentGradientMachine::init( ...@@ -208,6 +208,7 @@ void RecurrentGradientMachine::init(
}); });
CHECK(subModelConfig != config.sub_models().end()); CHECK(subModelConfig != config.sub_models().end());
reversed_ = subModelConfig->reversed(); reversed_ = subModelConfig->reversed();
generating_ = subModelConfig->has_generator();
inFrameLines_.resize(subModelConfig->in_links_size()); inFrameLines_.resize(subModelConfig->in_links_size());
for (size_t i = 0; i < inFrameLines_.size(); ++i) { for (size_t i = 0; i < inFrameLines_.size(); ++i) {
...@@ -538,7 +539,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs, ...@@ -538,7 +539,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
The outputs are outFramesLines_[i].agentLayer The outputs are outFramesLines_[i].agentLayer
*/ */
if (inFrameLines_.empty() && passType == PASS_TEST) { if (generating_) {
generateSequence(); generateSequence();
return; return;
} // else forward.. } // else forward..
...@@ -569,6 +570,9 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs, ...@@ -569,6 +570,9 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
} }
void RecurrentGradientMachine::backward(const UpdateCallback& callback) { void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
if (generating_) {
return;
}
REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
for (int i = maxSequenceLength_ - 1; i >= 0; --i) { for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
...@@ -1321,11 +1325,10 @@ void RecurrentGradientMachine::fillGenOutputs() { ...@@ -1321,11 +1325,10 @@ void RecurrentGradientMachine::fillGenOutputs() {
batchMachineIdVec_.clear(); batchMachineIdVec_.clear();
generator_.ids.clear(); generator_.ids.clear();
int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
starts[0] = 0;
if (numResults > 1) { if (numResults > 1) {
real* probs = generator_.outArg.in->getData(); real* probs = generator_.outArg.in->getData();
int* starts =
generator_.outArg.sequenceStartPositions->getMutableData(false);
starts[0] = 0;
for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t i = 0; i < finalPaths_.size(); ++i) {
for (size_t j = 0; j < finalPaths_[i].size(); ++j) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
Path& path = finalPaths_[i][j]; Path& path = finalPaths_[i][j];
...@@ -1348,7 +1351,10 @@ void RecurrentGradientMachine::fillGenOutputs() { ...@@ -1348,7 +1351,10 @@ void RecurrentGradientMachine::fillGenOutputs() {
} else { } else {
for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t i = 0; i < finalPaths_.size(); ++i) {
CHECK(!finalPaths_[i].empty()); CHECK(!finalPaths_[i].empty());
generator_.ids = finalPaths_[i][0].ids; generator_.ids.insert(generator_.ids.begin(),
finalPaths_[i][0].ids.begin(),
finalPaths_[i][0].ids.end());
starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
} }
} }
} }
......
...@@ -414,6 +414,7 @@ protected: ...@@ -414,6 +414,7 @@ protected:
std::vector<int> ids; // store generated sequences std::vector<int> ids; // store generated sequences
Argument outArg; // final output argument Argument outArg; // final output argument
}; };
bool generating_;
Generator generator_; Generator generator_;
std::vector<std::unique_ptr<NeuralNetwork>> frames_; std::vector<std::unique_ptr<NeuralNetwork>> frames_;
......
...@@ -109,6 +109,40 @@ void GatherAgentLayer::forwardValue(PassType passType) { ...@@ -109,6 +109,40 @@ void GatherAgentLayer::forwardValue(PassType passType) {
} }
} }
namespace {
// dest[index[i]] <- src[i] for each i
void copyElements(const IVector& srcVec,
const IVector& indexVec,
IVector& destVec) {
const int* src = srcVec.getData();
const int* index = indexVec.getData();
int* dest = destVec.getData();
int len = indexVec.getSize();
CHECK_EQ(srcVec.getSize(), indexVec.getSize());
for (int i = 0; i < len; ++i) {
dest[index[i]] = src[i];
}
}
}
void GatherAgentLayer::forwardIds(PassType passType) {
IVectorPtr realId = realLayers_[0]->getOutputLabel();
if (!realId) return;
IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
IVectorPtr outId = output_.ids;
idsVec_.resize(idIndex_.size());
for (size_t i = 0; i < realLayers_.size(); ++i) {
const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
/* size */ realId->getSize(),
useGpu_);
execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
}
}
void GatherAgentLayer::backward(const UpdateCallback& callback) { void GatherAgentLayer::backward(const UpdateCallback& callback) {
(void)callback; (void)callback;
const MatrixPtr& outputGrad = getOutputGrad(); const MatrixPtr& outputGrad = getOutputGrad();
...@@ -136,23 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) { ...@@ -136,23 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) {
CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
int width = this->getSize(); int width = this->getSize();
if (realOutArg_.hasSeq()) { if (selectionMode_) {
forwardSequence(passType); forwardWithSelection(passType);
} else if (realOutArg_.value || realOutArg_.ids) { } else {
output_.subArgFrom( if (realOutArg_.hasSeq()) {
realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); output_.subArgFrom(realOutArg_,
} else { // used in generation /* offset */ idIndex_,
if (realLayer_->getOutput().ids) { idSize_,
IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); width,
output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); useGpu_,
} /* trans */ false,
if (realLayer_->getOutput().value) { /* seqFlag */ true,
int height = ids_->getSize(); /* seqStart */ seqStartPosIndex_,
resetOutput(height, width); /* seqSize */ numSequences_);
} else {
const MatrixPtr& outV = getOutputValue(); output_.subArgFrom(
const MatrixPtr& realV = realLayer_->getOutputValue(); realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
outV->selectRows(*realV, *ids_);
} }
} }
} }
...@@ -160,6 +193,8 @@ void ScatterAgentLayer::forward(PassType passType) { ...@@ -160,6 +193,8 @@ void ScatterAgentLayer::forward(PassType passType) {
void ScatterAgentLayer::backward(const UpdateCallback& callback) { void ScatterAgentLayer::backward(const UpdateCallback& callback) {
(void)callback; (void)callback;
CHECK(!selectionMode_);
const MatrixPtr& outputGrad = realOutArg_.grad; const MatrixPtr& outputGrad = realOutArg_.grad;
const MatrixPtr& realGrad = realLayer_->getOutputGrad(); const MatrixPtr& realGrad = realLayer_->getOutputGrad();
if (realGrad) { if (realGrad) {
...@@ -174,42 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { ...@@ -174,42 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(gather_agent, GatherAgentLayer);
REGISTER_LAYER(scatter_agent, ScatterAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
void GatherAgentLayer::forwardIds(PassType passType) { void ScatterAgentLayer::forwardWithSelection(PassType passType) {
int height = 0;
IVectorPtr idReal = realLayers_[0]->getOutputLabel();
if (!idReal) return;
if (output_.subSequenceStartPositions) {
int* starts = output_.subSequenceStartPositions->getMutableData(false);
// Gather generator.idsVec
// if is beam search generation result. Get first result.
if (idReal->getData()[idReal->getSize() - 1] == -1) {
for (size_t i = 0; i < realLayers_.size(); ++i) {
// The first element stores first result size
idReal = realLayers_[i]->getOutputLabel();
idReal->subVecFrom(*idReal, 1, idReal->getData()[0]);
}
}
for (size_t i = 0; i < realLayers_.size(); ++i) {
CHECK(realLayers_[i]->getOutputLabel());
starts[i] = height;
height += realLayers_[i]->getOutputLabel()->getSize();
}
starts[realLayers_.size()] = height;
output_.sequenceStartPositions->getMutableData(false)[1] = height;
IVector::resizeOrCreate(output_.ids, height, false);
for (size_t i = 0; i < realLayers_.size(); ++i) {
output_.ids->subVec(starts[i], starts[i + 1] - starts[i])
->copyFrom(*realLayers_[i]->getOutputLabel());
}
} else {
LOG(FATAL) << "Not implemented";
}
}
void ScatterAgentLayer::forwardSequence(PassType passType) {
Layer::forward(passType); Layer::forward(passType);
CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
...@@ -220,17 +220,19 @@ void ScatterAgentLayer::forwardSequence(PassType passType) { ...@@ -220,17 +220,19 @@ void ScatterAgentLayer::forwardSequence(PassType passType) {
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
if (realOutArg_.value || realOutArg_.ids) { if (!input.hasSeq()) {
CHECK(realOutArg_.sequenceStartPositions); if (realLayer_->getOutput().ids) {
output_.subArgFrom(realOutArg_, IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
/* offset */ idIndex_, output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
idSize_, }
width, if (realLayer_->getOutput().value) {
useGpu_, int height = ids_->getSize();
/* trans */ false, resetOutput(height, width);
/* seqFlag */ true,
/* seqStart */ seqStartPosIndex_, const MatrixPtr& outV = getOutputValue();
/* seqSize */ numSequences_); const MatrixPtr& realV = realLayer_->getOutputValue();
outV->selectRows(*realV, *ids_);
}
} else { } else {
// Putting the generation logic here is really an ugly hack! // Putting the generation logic here is really an ugly hack!
// used in generation // used in generation
......
...@@ -110,6 +110,9 @@ protected: ...@@ -110,6 +110,9 @@ protected:
// of real layer. // of real layer.
ICpuGpuVectorPtr inputStartPos_; ICpuGpuVectorPtr inputStartPos_;
// true for setRealLayer, false for setRealLayerAndOutput
bool selectionMode_;
public: public:
explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {} explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
...@@ -137,6 +140,7 @@ public: ...@@ -137,6 +140,7 @@ public:
} else { } else {
cpuIds_ = ids_; cpuIds_ = ids_;
} }
selectionMode_ = true;
} }
// set real layer and output, [idIndex, idIndex + idSize) of *ids* // set real layer and output, [idIndex, idIndex + idSize) of *ids*
...@@ -153,6 +157,7 @@ public: ...@@ -153,6 +157,7 @@ public:
idIndex_ = idIndex; idIndex_ = idIndex;
idSize_ = idSize; idSize_ = idSize;
handleBackward_ = handleBackward; handleBackward_ = handleBackward;
selectionMode_ = false;
} }
void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
...@@ -166,7 +171,7 @@ public: ...@@ -166,7 +171,7 @@ public:
void forward(PassType passType) override; void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override; void backward(const UpdateCallback& callback) override;
void forwardSequence(PassType passType); void forwardWithSelection(PassType passType);
}; };
} // namespace paddle } // namespace paddle
...@@ -35,7 +35,7 @@ def outer_step(dummy_data): ...@@ -35,7 +35,7 @@ def outer_step(dummy_data):
embedding_size=num_words)] embedding_size=num_words)]
def inner_step(dummy_memory, predict_word): def inner_step(dummy_memory, predict_word):
# simplified RNN for testing # simplified RNN for testing
with mixed_layer(size=num_words) as layer: with mixed_layer(size=num_words) as layer:
layer += full_matrix_projection(input=predict_word, layer += full_matrix_projection(input=predict_word,
...@@ -46,15 +46,15 @@ def outer_step(dummy_data): ...@@ -46,15 +46,15 @@ def outer_step(dummy_data):
param_attr=ParamAttr(name="wordvec")) param_attr=ParamAttr(name="wordvec"))
return out return out
beam_gen = beam_search(name="rnn_gen", beam_gen = beam_search(name="rnn_gen",
step=inner_step, step=inner_step,
input=gen_inputs, input=gen_inputs,
bos_id=0, bos_id=0,
eos_id=num_words-1, eos_id=num_words-1,
beam_size=2 if beam_flag else 1, beam_size=2 if beam_flag else 1,
num_results_per_sample=2 if beam_flag else 1, num_results_per_sample=1,
max_length=10) max_length=10)
return beam_gen return beam_gen
beam_gen_concat = recurrent_group(name="rnn_gen_concat", beam_gen_concat = recurrent_group(name="rnn_gen_concat",
......
...@@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2), ...@@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2),
embedding_size=num_words)] embedding_size=num_words)]
def step(dummy_memory, predict_word): def step(dummy_memory, predict_word):
# simplified RNN for testing # simplified RNN for testing
with mixed_layer(size=num_words) as layer: with mixed_layer(size=num_words) as layer:
layer += full_matrix_projection(input=predict_word, layer += full_matrix_projection(input=predict_word,
...@@ -44,7 +44,7 @@ def step(dummy_memory, predict_word): ...@@ -44,7 +44,7 @@ def step(dummy_memory, predict_word):
param_attr=ParamAttr(name="wordvec")) param_attr=ParamAttr(name="wordvec"))
return out return out
beam_gen = beam_search(name="rnn_gen", beam_gen = beam_search(name="rnn_gen",
step=step, step=step,
input=gen_inputs, input=gen_inputs,
...@@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen", ...@@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen",
eos_id=num_words-1, eos_id=num_words-1,
beam_size=2 if beam_flag else 1, beam_size=2 if beam_flag else 1,
num_results_per_sample=2 if beam_flag else 1, num_results_per_sample=2 if beam_flag else 1,
max_length=10) max_length=10)
seqtext_printer_evaluator(input=beam_gen, seqtext_printer_evaluator(input=beam_gen,
id_input=sent_id, id_input=sent_id,
......
...@@ -55,13 +55,17 @@ public: ...@@ -55,13 +55,17 @@ public:
* Else, just set status to popping. * Else, just set status to popping.
*/ */
void pop(const T& item) { void pop(const T& item) {
pushing() = false;
auto& s = this->stack(); auto& s = this->stack();
if (item == s.top()) { if (item == s.top()) {
s.pop(); s.pop();
} }
} }
/**
* @brief Indicate whether we are at forward or backward stage of computation
*/
void set_stage(bool isForward) { pushing() = isForward; }
/** /**
* @brief clear current thread stack. * @brief clear current thread stack.
*/ */
......
...@@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) { ...@@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) {
for (size_t i = 0; i < layerSize; ++i) { for (size_t i = 0; i < layerSize; ++i) {
tracer.push("layer_" + paddle::str::to_string(i)); tracer.push("layer_" + paddle::str::to_string(i));
} }
tracer.pop("");
for (size_t i = 0; i < layerSize; ++i) { for (size_t i = 0; i < layerSize; ++i) {
tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
} }
......
...@@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network'] ...@@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network']
def __need_to_keep__(name): def __need_to_keep__(name):
return name in [ return name in [
'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType', 'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
'layer_support' 'layer_support', 'BaseGeneratedInput'
] ]
def __need_to_wrap__(name): def __need_to_wrap__(name):
return name not in ['AggregateLevel', 'ExpandLevel'] return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
def __convert_name__(inname): def __convert_name__(inname):
...@@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names): ...@@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names):
return submodel_names return submodel_names
def __get_submodel_data_out_links__():
data_links = set()
for submodel in cp.g_config.model_config.sub_models:
for link in submodel.out_links:
if cp.g_layer_map[link.link_name].type == 'data':
data_links.add(link.link_name)
return data_links
def __get_used_evaluators__(layer_names): def __get_used_evaluators__(layer_names):
evaluator_names = set() evaluator_names = set()
for e in cp.g_config.model_config.evaluators: for e in cp.g_config.model_config.evaluators:
...@@ -264,6 +273,7 @@ def parse_network(output_layers, extra_layers=None): ...@@ -264,6 +273,7 @@ def parse_network(output_layers, extra_layers=None):
submodel_names = __get_used_submodels__(layer_names) submodel_names = __get_used_submodels__(layer_names)
submodel_names.add('root') submodel_names.add('root')
evaluator_names = __get_used_evaluators__(layer_names) evaluator_names = __get_used_evaluators__(layer_names)
data_out_links = __get_submodel_data_out_links__()
input_layer_names = set() input_layer_names = set()
output_layer_names = set() output_layer_names = set()
...@@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None): ...@@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None):
continue continue
model_config.layers.extend([l]) model_config.layers.extend([l])
if l.type == 'data': if l.type == 'data':
if l.name in model_config.output_layer_names: if l.name in data_out_links:
""" """
In text generation, the outlink to save the generated word In text generation, the outlink to save the generated word
indices is a data_layer defined in recurrent_group. This indices is a data_layer defined in recurrent_group. This
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册