diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 01158d1dce8d711c67b1ecf29bb644e42ccf6ff5..3e930380226bce58cc90704b4c4cfa36e9f70968 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -214,7 +214,6 @@ void RecurrentGradientMachine::init( inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name(); inFrameLines_[i].inLayer = rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name()); - inFrameLines_[i].hasSubseq = subModelConfig->in_links(i).has_subseq(); } outFrameLines_.resize(subModelConfig->out_links_size()); @@ -241,11 +240,8 @@ void RecurrentGradientMachine::init( rootNetwork_->getLayer(memoryConfig.boot_layer_name()); LayerConfig scatterConfig = *agentConfig; - memoryFrameLines_[i].is_sequence = memoryConfig.is_sequence(); memoryFrameLines_[i].rootAgent.reset( - memoryConfig.is_sequence() - ? new SequenceScatterAgentLayer(scatterConfig) - : new ScatterAgentLayer(scatterConfig)); + new ScatterAgentLayer(scatterConfig)); memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_); memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent; @@ -267,9 +263,7 @@ void RecurrentGradientMachine::init( if (subModelConfig->has_generator()) { memoryFrameLines_[i].scatterAgents.resize(2); for (auto& agent : memoryFrameLines_[i].scatterAgents) { - agent.reset(memoryConfig.is_sequence() - ? new SequenceScatterAgentLayer(*agentConfig) - : new ScatterAgentLayer(*agentConfig)); + agent.reset(new ScatterAgentLayer(*agentConfig)); agent->init(LayerMap(), parameterMap_); } } @@ -297,8 +291,6 @@ void RecurrentGradientMachine::init( if (subModelConfig->evaluator_names_size() > 0) { evaluator_.reset(frames_[0]->makeEvaluator()); } - - targetInfoInlinkId_ = subModelConfig->target_inlinkid(); } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -376,108 +368,102 @@ void RecurrentGradientMachine::prefetch(const std::vector& inArgs) { LOG(FATAL) << "should not use this function"; } -void RecurrentGradientMachine::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - if (inFrameLines_.empty() && passType == PASS_TEST) { - generateSequence(); - return; - } // else forward.. - - const Argument& input = inFrameLines_[0].inLayer->getOutput(); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - const int* starts = input.sequenceStartPositions->getData(false); - bool hasSubseq = input.hasSubseq(); - - // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the - // same inframe info - bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1; - - // Defaultly, share info with the first inlink - if (shareInlinkInfo) { - targetInfoInlinkId_ = 0; - } - - // check hasSubseq in both config and input are the same - CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq); - - CHECK_EQ(starts[numSequences], batchSize); - CHECK(input.sequenceStartPositions); - - // check other inputs has same sequence length and start - for (size_t i = 1; i < inFrameLines_.size(); ++i) { - const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); - CHECK_EQ((size_t)input1.getNumSequences(), numSequences); - // check all inputs should have same hasSubseq flag - CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq); - - // if shareInlinkInfo, checks: - // 1. 
all inlinks have same number of total tokens - // 2. all inlinks have same number of tokens for each sentence of each - // sample. If hasSubseq, one sample has multiple sentence, else, one - // sample is one sentence - if (shareInlinkInfo) { - CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, - starts + numSequences + 1, - input1.sequenceStartPositions->getData(false))); +void RecurrentGradientMachine::checkInputConsistency( + int inlinkId, const std::vector& seqInfo) { + if (commonSeqInfo_.empty()) { + commonSeqInfo_.resize(seqInfo.size()); + for (size_t i = 0; i < seqInfo.size(); ++i) { + commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength; + commonSeqInfo_[i].seqId = seqInfo[i].seqId; + } + } else { + CHECK_EQ(commonSeqInfo_.size(), seqInfo.size()) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched number of sequences"; + for (size_t i = 0; i < seqInfo.size(); ++i) { + CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; + CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; } } +} - if (hasSubseq) { - CHECK(input.subSequenceStartPositions); - size_t numSubSequences = input.getNumSubSequences(); - const int* subStarts = input.subSequenceStartPositions->getData(false); - CHECK_EQ(subStarts[numSubSequences], batchSize); - // if hasSubseq, check other inputs has same sub-sequence and sub-start - for (size_t i = 1; i < inFrameLines_.size(); ++i) { - const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); - CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); - if (shareInlinkInfo) { - CHECK(std::equal(subStarts, - subStarts + numSubSequences + 1, - input1.subSequenceStartPositions->getData(false))); - } +void RecurrentGradientMachine::calcNumSequencesAtEachStep() { + int numSequences = commonSeqInfo_.size(); + numSeqs_.resize(maxSequenceLength_); + for (int i = 0; i < numSequences; ++i) { + for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) { + numSeqs_[j] = i + 1; } } +} +void RecurrentGradientMachine::reorganizeInput(PassType passType) { info_.clear(); info_.resize(inFrameLines_.size()); + commonSeqInfo_.clear(); seqInfos_.clear(); seqInfos_.resize(inFrameLines_.size()); + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + continue; + } + input.getSeqInfo(&seqInfos_[i]); + checkInputConsistency(i, seqInfos_[i]); + } + CHECK(!commonSeqInfo_.empty()) + << "At least one input needs to be sequence or subsequence"; + maxSequenceLength_ = commonSeqInfo_[0].topLevelLength; + + calcNumSequencesAtEachStep(); + + for (size_t i = 0; i < inFrameLines_.size(); ++i) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + seqInfos_[i] = commonSeqInfo_; + } + createInFrameInfo(i, input, passType); + } + { AsyncGpuBlock asyncGpuBlock; - // if shareInlinkInfo, only calculate info of the first inlink - // else, calculate info for each inlink - if (shareInlinkInfo) { - input.getSeqInfo(&seqInfos_[0]); - maxSequenceLength_ = seqInfos_[0][0].topLevelLength; - createInFrameInfo(0, input, passType); - } else { - for (size_t i = 0; i < inFrameLines_.size(); i++) { - const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); - input1.getSeqInfo(&seqInfos_[i]); - 
maxSequenceLength_ = seqInfos_[i][0].topLevelLength; - createInFrameInfo(i, input1, passType); - } - } // inFrameLine select rows in real layer one time for (size_t i = 0; i < inFrameLines_.size(); i++) { - int curInlinkId = shareInlinkInfo ? 0 : i; selectRowsOneTime(inFrameLines_[i].inLayer, - info_[curInlinkId].allIds, + info_[i].allIds, &(inFrameLines_[i].outArg), passType); } } - resizeOrCreateFrames(maxSequenceLength_); - resizeBootFrame(numSequences); +} + +void RecurrentGradientMachine::reorganizeOutput(PassType passType) { + calcSequenceStartPositions(); + for (size_t i = 0; i < outFrameLines_.size(); ++i) { + Info info; + auto& outFrameLine = outFrameLines_[i]; + ICpuGpuVectorPtr sequenceStartPositions; + ICpuGpuVectorPtr subSequenceStartPositions; + createOutFrameInfo( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + CHECK_NOTNULL(gatherAgent); + gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions, + subSequenceStartPositions, + info.allIds, + info.idIndex); + } +} +void RecurrentGradientMachine::connectFrames(PassType passType) { for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -487,8 +473,9 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, memoryFrameLine.outArg, memoryFrameLine.allIds, /* idIndex */ 0, - memoryFrameLine.allIds->getSize()); - if (memoryFrameLine.is_sequence) { // memoryConfig is sequence + memoryFrameLine.allIds->getSize(), + /* handleBackward */ true); + if (memoryFrameLine.sequenceStartPositions) { int size = memoryFrameLine.sequenceStartPositions->getSize(); scatterAgent->setSequenceStartPositions( memoryFrameLine.sequenceStartPositions, @@ -501,28 +488,26 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, for (auto& outFrameLine : outFrameLines_) { auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); - CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, - info_[targetInfoInlinkId_].allIds, - info_[targetInfoInlinkId_].idIndex); + gatherAgent->clearRealLayers(); } - for (int i = 0; i < maxSequenceLength_; ++i) { - int idSize = 0; // connect in_links for (size_t j = 0; j < inFrameLines_.size(); ++j) { - Info& info = info_[shareInlinkInfo ? 0 : j]; + Info& info = info_[j]; // idSize denotes the sum number of tokens in each length i - idSize = info.idIndex[i + 1] - info.idIndex[i]; + int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i]; + int idSize = info.idIndex.empty() ? numSeqs_[i] + : info.idIndex[i + 1] - info.idIndex[i]; InFrameLine inFrameLine = inFrameLines_[j]; auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, inFrameLine.outArg, info.allIds, - info.idIndex[i], - idSize); - if (hasSubseq) { + idIndex, + idSize, + i == 0); + if (info.sequenceStartPositions) { // size: the length of subsequence int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; scatterAgent->setSequenceStartPositions( @@ -536,11 +521,6 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, dynamic_cast(outFrameLine.agentLayer.get()); gatherAgent->addRealLayer(outFrameLine.frames[i]); } - // connect memory links - // Adopt info_[0].idIndex because seq which has_subseq=True - // doesn't support Memory with !hasSubseq bootlayer; - // And inlinks that !hasSubSeq must have same inlink length. 
-    idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i];
     for (auto& memoryFrameLine : memoryFrameLines_) {
       NeuralNetwork::connect(
           memoryFrameLine.agents[i],
@@ -548,6 +528,28 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
           numSeqs_[i] /*height of agent*/);
     }
   }
+}
+
+void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
+                                       std::vector<Argument>* outArgs,
+                                       PassType passType) {
+  /* inArgs and outArgs are not used.
+     The inputs are inFrameLines_[i].inLayer.
+     The outputs are outFramesLines_[i].agentLayer
+  */
+
+  if (inFrameLines_.empty() && passType == PASS_TEST) {
+    generateSequence();
+    return;
+  }  // else forward..
+
+  reorganizeInput(passType);
+  int numSequences = commonSeqInfo_.size();
+
+  resizeOrCreateFrames(maxSequenceLength_);
+  resizeBootFrame(numSequences);
+
+  connectFrames(passType);

   REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
   // forward
@@ -558,16 +560,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     frames_[i]->forward(inArgs, &outArgs, passType);
-    if (hasSubseq) {
-      for (auto& outFrameLine : outFrameLines_) {
-        CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
-            << "In hierachical RNN, all out links should be from sequences.";
-      }
-    }
   }
   if (evaluator_ && passType == PASS_TEST) {
     this->eval(evaluator_.get());
   }
+
+  reorganizeOutput(passType);
 }

 void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
@@ -634,76 +632,228 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
     this->beamSearchStatistics_ = nullptr;
   }
 }
+
+namespace {
+void lenToStarts(std::vector<int>& starts) {
+  int pos = 0;
+  starts.back() = 0;
+  for (auto& start : starts) {
+    int tmp = start;
+    start = pos;
+    pos += tmp;
+  }
+  starts.back() = pos;
+}
+}
+
+void RecurrentGradientMachine::calcSequenceStartPositions() {
+  std::vector<int> starts(commonSeqInfo_.size() + 1);
+  for (auto& seqInfo : commonSeqInfo_) {
+    starts[seqInfo.seqId] = seqInfo.topLevelLength;
+  }
+  lenToStarts(starts);
+  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
+  std::copy(starts.begin(),
+            starts.end(),
+            sequenceStartPositions_->getMutableData(false));
+}
+
+void RecurrentGradientMachine::checkOutputConsistency(
+    OutFrameLine& outFrameLine) {
+  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
+    int numSequences = frame->getOutput().getNumSequences();
+    CHECK_EQ(numSeqs_[i], numSequences);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  checkOutputConsistency(outFrameLine);
+
+  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
+    createOutFrameInfo_seq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  } else {
+    createOutFrameInfo_subseq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_seq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame =
outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int seqStart = starts[commonSeqInfo_[j].seqId]; + int seqLength = commonSeqInfo_[j].topLevelLength; + allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + info.idIndex.push_back(allIds.size()); + } + sequenceStartPositions = sequenceStartPositions_; + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + +void RecurrentGradientMachine::createOutFrameInfo_subseq( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + size_t numSequences = commonSeqInfo_.size(); + std::vector allIds; + info.idIndex.resize(1, 0); // first idIndex = 0 + + const int* starts = sequenceStartPositions_->getData(false); + std::vector subStarts(starts[numSequences] + 1); + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + const int* seqStarts = + frame->getOutput().sequenceStartPositions->getData(false); + for (size_t j = 0; j < numSequences; ++j) { + subStarts[starts[commonSeqInfo_[j].seqId] + i] = + seqStarts[j + 1] - seqStarts[j]; + } + } + lenToStarts(subStarts); + + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int pos = starts[commonSeqInfo_[j].seqId] + i; + int subSeqStart = subStarts[pos]; + int subSeqEnd = subStarts[pos + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); + } + } + info.idIndex.push_back(allIds.size()); + } + + ICpuGpuVector::resizeOrCreate( + subSequenceStartPositions, subStarts.size(), false); + int* cpuSubSequenceStartPositions = + subSequenceStartPositions->getMutableData(false); + std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + int* cpuSequenceStartPositions = + sequenceStartPositions->getMutableData(false); + for (size_t i = 0; i <= numSequences; ++i) { + cpuSequenceStartPositions[i] = subStarts[starts[i]]; + } + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + /* create scattered id infomation for all realLayer of inFrameLines one time. * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. 
*/ - void RecurrentGradientMachine::createInFrameInfo(int inlinkId, const Argument& input, PassType passType) { - bool hasSubseq = input.hasSubseq(); - // numSequences: # samples(sequences) in a batch - size_t numSequences = input.getNumSequences(); + if (!input.hasSeq()) { + createInFrameInfo_nonseq(inlinkId, input, passType); + } else if (!input.hasSubseq()) { + createInFrameInfo_seq(inlinkId, input, passType); + } else { + createInFrameInfo_subseq(inlinkId, input, passType); + } +} + +void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId, + const Argument& input, + PassType passType) { std::vector allIds; auto& seqInfo = seqInfos_[inlinkId]; - - numSeqs_.clear(); Info* inlinkInfo = &info_[inlinkId]; inlinkInfo->idIndex.clear(); - inlinkInfo->idIndex.push_back(0); // first idIndex = 0 + for (size_t i = 0; i < seqInfo.size(); ++i) { + allIds.push_back(seqInfo[i].seqId); + } + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); +} +void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + auto& seqInfo = seqInfos_[inlinkId]; + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 + + for (int i = 0; i < maxSequenceLength_; ++i) { + for (int j = 0; j < numSeqs_[i]; ++j) { + int seqLength = seqInfo[j].topLevelLength; + int seqStart = seqInfo[j].seqStart; + allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + inlinkInfo->idIndex.push_back(allIds.size()); + } + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); +} +void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + + auto& seqInfo = seqInfos_[inlinkId]; + + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 std::vector sequenceStartPositions; const int* subSequenceStartPositions = nullptr; - if (hasSubseq) { // for sequenceScatterAgentLayer - subSequenceStartPositions = input.subSequenceStartPositions->getData(false); - inlinkInfo->seqStartPosIndex.clear(); - inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 - } - // maxSequenceLength_: max topLevelLength in allsamples + subSequenceStartPositions = input.subSequenceStartPositions->getData(false); + inlinkInfo->seqStartPosIndex.clear(); + inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 for (int i = 0; i < maxSequenceLength_; ++i) { - if (hasSubseq) { - sequenceStartPositions.push_back(0); // first element = 0 - } - int numSeqs = 0; - for (size_t j = 0; j < numSequences; ++j) { - int seqLength = seqInfo[j].topLevelLength; - if (i >= seqLength) { - break; - } - ++numSeqs; - if (hasSubseq) { - int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; - int subSeqEnd = - subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; - for (int k = subSeqStart; k < subSeqEnd; ++k) { - allIds.push_back(k); - } - sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqEnd - subSeqStart); - } else { - int seqStart = seqInfo[j].seqStart; - allIds.push_back(reversed_ ? 
(seqStart + seqLength - 1 - i) - : (seqStart + i)); + sequenceStartPositions.push_back(0); // first element = 0 + for (int j = 0; j < numSeqs_[i]; ++j) { + int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; + int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); } + sequenceStartPositions.push_back(sequenceStartPositions.back() + + subSeqEnd - subSeqStart); } inlinkInfo->idIndex.push_back(allIds.size()); - numSeqs_.push_back(numSeqs); - if (hasSubseq) { - inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); - } - } - if (hasSubseq) { - // inFrameLine create sequenceStartPositions one time - CHECK_EQ( - sequenceStartPositions.size(), - static_cast(maxSequenceLength_ + input.getNumSubSequences())); - CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), - static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); + inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); } + // inFrameLine create sequenceStartPositions one time + CHECK_EQ( + sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + input.getNumSubSequences())); + CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), + static_cast(maxSequenceLength_ + 1)); + createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); // copy and check scatterId copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); @@ -717,11 +867,11 @@ void RecurrentGradientMachine::createMemoryFrameInfo( const Argument& input = (*memoryFrameLine).rootLayer->getOutput(); size_t numSequences = input.getNumSequences(); std::vector allIds; - bool seqFlag = (*memoryFrameLine).is_sequence; + bool seqFlag = input.hasSeq(); + CHECK(!input.hasSubseq()) + << "Subsequence boot layer for memory is not supported"; if (seqFlag) { // for sequenceScatterAgentLayer - CHECK(input.sequenceStartPositions) - << "boot layer must be a sequence when is_sequence = true"; std::vector sequenceStartPositions; sequenceStartPositions.push_back(0); // first element = 0 const int* starts = input.sequenceStartPositions->getData(false); @@ -804,8 +954,7 @@ size_t RecurrentGradientMachine::getGenBatchSize() { for (auto& memoryFrameLine : memoryFrameLines_) { if (!memoryFrameLine.rootLayer) continue; Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = memoryFrameLine.is_sequence ? 
bootArg.getNumSequences() - : bootArg.getBatchSize(); + size_t batchSize = bootArg.getNumSequences(); if (numSequences) { CHECK_EQ(numSequences, batchSize); } else { @@ -845,12 +994,7 @@ void RecurrentGradientMachine::generateSequence() { if (memoryFrameLine.rootAgent) { auto scatterAgent = dynamic_cast(memoryFrameLine.rootAgent.get()); - bool seqFlag = memoryFrameLine.is_sequence; - scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids, seqFlag); - if (seqFlag) { - CHECK(memoryFrameLine.rootLayer->getOutput().sequenceStartPositions) - << "boot layer must be a sequence when is_sequence = true"; - } + scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids); } NeuralNetwork::connect( memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); @@ -858,6 +1002,7 @@ void RecurrentGradientMachine::generateSequence() { // boot layer forward AsyncGpuBlock asyncGpuBlock; + for (auto& memoryFrameLine : memoryFrameLines_) { memoryFrameLine.bootLayer->forward(PASS_TEST); } @@ -930,8 +1075,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { auto scatterAgent = dynamic_cast( memoryFrameLine.scatterAgents[machineCur].get()); scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - scatterIds, - memoryFrameLine.is_sequence); + scatterIds); scatterAgent->forward(PASS_TEST); NeuralNetwork::connect(memoryFrameLine.agents[machineCur], memoryFrameLine.scatterAgents[machineCur]); @@ -1003,8 +1147,7 @@ void RecurrentGradientMachine::connectPrevFrame(int stepId, auto scatterAgent = dynamic_cast( memoryFrameLine.scatterAgents[machineCur].get()); scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - isOutIds ? topIds_ : machineIds_, - memoryFrameLine.is_sequence); + isOutIds ? topIds_ : machineIds_); scatterAgent->forward(PASS_TEST); NeuralNetwork::connect(memoryFrameLine.agents[machineCur], memoryFrameLine.scatterAgents[machineCur]); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index c2bc52709ab42bbe21dcc3951f23f2e0b5e6793d..8d94d7e2df216c4657d759c16dd6b1f2848996e0 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -284,6 +284,16 @@ public: } protected: + std::vector commonSeqInfo_; + ICpuGpuVectorPtr sequenceStartPositions_; + void calcSequenceStartPositions(); + void checkInputConsistency(int inlinkId, + const std::vector& seqInfo); + void reorganizeInput(PassType passType); + void reorganizeOutput(PassType passType); + void connectFrames(PassType passType); + void calcNumSequencesAtEachStep(); + void resizeOrCreateFrames(int numFrames); void resizeBootFrame(int numSequences); @@ -295,8 +305,7 @@ protected: std::string linkName; LayerPtr inLayer; std::vector agents; // Scatter Agents to reform batch input - bool hasSubseq; - Argument outArg; // scatter output argument + Argument outArg; // scatter output argument }; std::vector inFrameLines_; @@ -318,7 +327,6 @@ protected: std::vector agents; std::vector scatterAgents; // scatter agent used by beam search Argument outArg; // scatter output argument - bool is_sequence; // Different memoryFrameLine have different element as follows IVectorPtr allIds; // scattered id of realLayer ICpuGpuVectorPtr @@ -330,22 +338,27 @@ protected: // and all outFrameLines(outlinks) share the info with one inFrameLine, // which is assigned by targetInfoInlinkId_. 
  struct Info {
-    IVectorPtr allIds;         // scattered id of realLayer
-    std::vector<int> idIndex;  // index of allIds
+    // The original positions in the original batch
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+
+    // index of allIds for each step [maxSequenceLength_]
+    // idIndex[i] is the total length of the first i sequences
+    std::vector<int> idIndex;
+
     ICpuGpuVectorPtr
         sequenceStartPositions;         // scattered sequenceStartPositions
     std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
   };
-  std::vector<Info> info_;
+  std::vector<Info> info_;  // for input

   // numSeqs_[i] is the number sequences which is longer than i (for sequence
   // data) or has more than i subsequences (for subsequence data)
+  // Equivalently, numSeqs_[i] is the number of sequences at step i;
   std::vector<int> numSeqs_;

   std::vector<std::vector<Argument::SeqInfo>> seqInfos_;

-  // the id of inlink which share info with outlinks
-  int targetInfoInlinkId_;
+  void checkOutputConsistency(OutFrameLine& outFrameLine);

   /* create scattered id infomation for all realLayer of inFrameLines one time.
   * If hasSubseq, will also create scattered sequenceStartPositions infomation
   * for all realLayer of inFrameLines one time.
@@ -354,6 +367,28 @@ protected:
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);

   void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                              PassType passType);
@@ -386,9 +421,7 @@ protected:
   NeuralNetwork* rootNetwork_;
   bool reversed_;

-  // if hasSubseq: max number of sentences(subseq)in batchsize samples
-  // else: max number of tokens in batchsize samples(sentences)
-  int maxSequenceLength_;
+  int maxSequenceLength_;  // Max top-level length

   bool useGpu_;
   bool stopBeamSearch_;
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 7b1b99b135e35e5fe41dbb3d053a96e3e31e5cf1..31463823b3fc04cc24068d95887a9d3ed25a6168 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -36,14 +36,23 @@ void AgentLayer::forward(PassType passType) {
   Layer::forward(passType);

   Argument& realOutput = realLayer_->getOutput();
-  int realHeight = realOutput.getBatchSize();
-  CHECK_LE(numSamples_, realHeight);
+  int realNumSequences = realOutput.getNumSequences();
+  CHECK_LE(numSamples_, realNumSequences);

   // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realHeight) {
-    if (realOutput.ids) {
-      output_.ids =
-          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
+  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
+      int numRows =
+          realOutput.sequenceStartPositions->getData(false)[numSamples_];
+      output_.subArgFrom(realOutput,
+                         /* offset */ 0,
+                         numRows,
+                         getSize(),
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ 0,
+                         /* seqSize */ numSamples_ + 1);
     } else {
output_.subArgFrom( realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); @@ -53,34 +62,6 @@ void AgentLayer::forward(PassType passType) { } } -void SequenceAgentLayer::forward(PassType passType) { - Layer::forward(passType); - - Argument& realOutput = realLayer_->getOutput(); - int realNumSequences = realOutput.getNumSequences(); - CHECK_LE(numSamples_, realNumSequences); - - // get Arguments from real layers - if (numSamples_ > 0 && numSamples_ < realNumSequences) { - int numRows = - realOutput.sequenceStartPositions->getData(false)[numSamples_]; - CHECK(!realOutput.ids) << "Not supported"; - output_.subArgFrom(realOutput, - /* offset */ 0, - numRows, - getSize(), - useGpu_, - /* trans */ false, - /* seqFlag */ true, - /* seqStart */ 0, - /* seqSize */ numSamples_ + 1); - } else { - output_ = realOutput; - } -} - -REGISTER_LAYER(sequence_agent, SequenceAgentLayer); - bool GatherAgentLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { CHECK_EQ(config_.inputs_size(), 0); @@ -91,18 +72,26 @@ bool GatherAgentLayer::init(const LayerMap& layerMap, return true; } -void GatherAgentLayer::copyIdAndSequenceInfo(const Argument& input, - const IVectorPtr& ids, - const std::vector& idIndex) { - output_.sequenceStartPositions = input.sequenceStartPositions; - output_.subSequenceStartPositions = input.subSequenceStartPositions; - realLayers_.clear(); +void GatherAgentLayer::copyIdAndSequenceInfo( + ICpuGpuVectorPtr sequenceStartPositions, + ICpuGpuVectorPtr subSequenceStartPositions, + const IVectorPtr& ids, + const std::vector& idIndex) { + output_.sequenceStartPositions = sequenceStartPositions; + output_.subSequenceStartPositions = subSequenceStartPositions; allIds_ = ids; idIndex_ = idIndex; } void GatherAgentLayer::forward(PassType passType) { Layer::forward(passType); + forwardIds(passType); + forwardValue(passType); +} + +void GatherAgentLayer::forwardValue(PassType passType) { + MatrixPtr valueReal = realLayers_[0]->getOutputValue(); + if (!valueReal) return; int height = allIds_->getSize(); int width = this->getSize(); @@ -147,7 +136,9 @@ void ScatterAgentLayer::forward(PassType passType) { CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); int width = this->getSize(); - if (realOutArg_.value || realOutArg_.ids) { + if (realOutArg_.hasSeq()) { + forwardSequence(passType); + } else if (realOutArg_.value || realOutArg_.ids) { output_.subArgFrom( realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); } else { // used in generation @@ -174,7 +165,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { if (realGrad) { // for agent in inFrameLines and memoryFrameLines, // only first scatterAgentLayer should do addToRows in backward - if (idIndex_ == 0) { + if (handleBackward_) { outputGrad->addToRows(*realGrad, *ids_); } } @@ -183,12 +174,14 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer); -void SequenceGatherAgentLayer::forward(PassType passType) { - Layer::forward(passType); +void GatherAgentLayer::forwardIds(PassType passType) { int height = 0; - int* starts = output_.subSequenceStartPositions->getMutableData(false); IVectorPtr idReal = realLayers_[0]->getOutputLabel(); - if (idReal) { + + if (!idReal) return; + + if (output_.subSequenceStartPositions) { + int* starts = output_.subSequenceStartPositions->getMutableData(false); // Gather generator.idsVec // if is beam search generation result. Get first result. 
if (idReal->getData()[idReal->getSize() - 1] == -1) { @@ -212,13 +205,11 @@ void SequenceGatherAgentLayer::forward(PassType passType) { ->copyFrom(*realLayers_[i]->getOutputLabel()); } } else { - // Gather output.value, same as GatherAgentLayer - CHECK(output_.subSequenceStartPositions); - GatherAgentLayer::forward(passType); + LOG(FATAL) << "Not implemented"; } } -void SequenceScatterAgentLayer::forward(PassType passType) { +void ScatterAgentLayer::forwardSequence(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); @@ -241,6 +232,7 @@ void SequenceScatterAgentLayer::forward(PassType passType) { /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); } else { + // Putting the generation logic here is really an ugly hack! // used in generation int height = 0; size_t numSequences = ids_->getSize(); @@ -284,7 +276,4 @@ void SequenceScatterAgentLayer::forward(PassType passType) { } } -REGISTER_LAYER(sequence_gather_agent, SequenceGatherAgentLayer); -REGISTER_LAYER(sequence_scatter_agent, SequenceScatterAgentLayer); - } // namespace paddle diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index b6dac7ae6fec2d61c60c9548d466233efe9febd5..461b84b17e556b53e0734bff8e37a0d529a3290e 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -49,18 +49,6 @@ public: void backward(const UpdateCallback& callback = nullptr) override {} }; -/** - * like AgentLayer, but use first *numSamples* sequences - */ -class SequenceAgentLayer : public AgentLayer { -public: - explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {} - ~SequenceAgentLayer() {} - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override {} -}; - /** * Like AgentLayer, but it can gather many real layers. Each real * layer give a few rows of a sequence, after gather all real layers, @@ -83,7 +71,10 @@ public: const ParameterMap& parameterMap) override; // call before addRealLayer - void copyIdAndSequenceInfo(const Argument& input, + void clearRealLayers() { realLayers_.clear(); } + + void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions, + ICpuGpuVectorPtr subSequenceStartPositions, const IVectorPtr& allIds, const std::vector& idIndex); @@ -92,24 +83,8 @@ public: void forward(PassType passType) override; void backward(const UpdateCallback& callback) override; -}; - -/** - * Like GatherAgentLayer, but select a few sequence in real layer. - * *ids* in addRealLayer() are the ids of selected sequence. - * It's used to reorder sequence output. - */ -class SequenceGatherAgentLayer : public GatherAgentLayer { -public: - explicit SequenceGatherAgentLayer(const LayerConfig& config) - : GatherAgentLayer(config) {} - virtual ~SequenceGatherAgentLayer() {} - - void forward(PassType passType); - void backward(const UpdateCallback& callback) { - // same as GatherAgentLayer - GatherAgentLayer::backward(callback); - } + void forwardValue(PassType passType); + void forwardIds(PassType passType); }; /** @@ -129,6 +104,11 @@ protected: int idSize_; int seqStartPosIndex_; int numSequences_; // number of sequences in this scatterAgentLayer + bool handleBackward_; + + // use to store expanded cpuStartPositions or subSequenceStartPositions + // of real layer. 
+ ICpuGpuVectorPtr inputStartPos_; public: explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {} @@ -147,19 +127,15 @@ public: * false(default) in ScatterAgentLayer, and * true in SequenceScatterAgentLayer. */ - void setRealLayer(LayerPtr layer, - const std::vector& ids, - bool copyId = false) { + void setRealLayer(LayerPtr layer, const std::vector& ids) { realLayer_ = layer; IVector::resizeOrCreate(ids_, ids.size(), useGpu_); ids_->copyFrom(ids.data(), ids.size()); - if (copyId) { - if (useGpu_) { - IVector::resizeOrCreate(cpuIds_, ids.size(), false); - cpuIds_->copyFrom(ids.data(), ids.size()); - } else { - cpuIds_ = ids_; - } + if (useGpu_) { + IVector::resizeOrCreate(cpuIds_, ids.size(), false); + cpuIds_->copyFrom(ids.data(), ids.size()); + } else { + cpuIds_ = ids_; } } @@ -169,12 +145,14 @@ public: const Argument& outArg, const IVectorPtr& ids, int idIndex, - int idSize) { + int idSize, + bool handleBackward) { realLayer_ = layer; realOutArg_ = outArg; ids_ = ids; idIndex_ = idIndex; idSize_ = idSize; + handleBackward_ = handleBackward; } void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, @@ -187,28 +165,8 @@ public: void forward(PassType passType) override; void backward(const UpdateCallback& callback) override; -}; -/** - * Like ScatterAgentLayer, but select a few sequence in real layer. - * *ids* in setRealLayer() or setRealLayerAndOutput() are the ids of - * selected sequence. It's used to reorder sequence input. - */ -class SequenceScatterAgentLayer : public ScatterAgentLayer { -protected: - // use to store expanded cpuStartPositions or subSequenceStartPositions - // of real layer. - ICpuGpuVectorPtr inputStartPos_; - -public: - explicit SequenceScatterAgentLayer(const LayerConfig& config) - : ScatterAgentLayer(config) {} - virtual ~SequenceScatterAgentLayer() {} - - void forward(PassType passType); - void backward(const UpdateCallback& callback) { - ScatterAgentLayer::backward(callback); - } + void forwardSequence(PassType passType); }; } // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 235d9a9b0f0653df5c0b671092df9e195f08fc48..4179a9e7e0cb58fcb49bff712e62b9f3fea373bd 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -46,6 +46,9 @@ void SequencePoolLayer::forward(PassType passType) { Layer::forward(passType); const Argument& input = getInput(0); + CHECK(input.hasSeq() || input.hasSubseq()) + << "Input should be a sequence or subsequence for layer " << getName(); + newBatchSize_ = type_ ? 
input.getNumSubSequences() : input.getNumSequences(); size_t dim = getSize(); // check diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 3afd45c72f4dd071ddca569caac8716fe102299b..913365a5a4037d14fcba1e1546508ba89668e0d6 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -95,3 +95,22 @@ def process_unequalength_seq(settings, file_name): words1 = reduce(lambda x, y: x + y, d[0]) words2 = reduce(lambda x, y: x + y, d[1]) yield words1, words2, d[2] + + +########################################################### +data3 = [ + [[[1, 2], [4, 5, 2]], [1, 2], 0], + [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1], +] + + +# Used for sequence_nest_mixed_inputs.conf +@provider( + input_types=[ + integer_value_sub_sequence(10), integer_value_sequence(10), + integer_value(2) + ], + should_shuffle=False) +def process_mixed(settings, file_name): + for d in data3: + yield d diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf index ad14a2c927c89c9b480af5ad565c37e8b2e54469..afdacfffd7aecfe2f4762f04a987126381bcea34 100644 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_subseq2') + obj='process_subseq') settings(batch_size=2, learning_rate=0.01) @@ -57,7 +57,7 @@ def outer_step(wid, x): last = last_seq(input=inner_rnn_output, name="outer_rnn_state") # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it, and will report error: In hierachical RNN, all out + # does not handle it, and will report error: In hierachical RNN, all out # links should be from sequences now. return inner_rnn_output diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..e2635b4400b13517bac716a5a0affeb16c218b09 --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py @@ -0,0 +1,85 @@ +# edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + +subseq = embedding_layer(input=data1, size=word_dim) +seq = embedding_layer(input=data2, size=word_dim) +nonseq = embedding_layer(input=label, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_multi_unequalength_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(subseq, seq, nonseq): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', input=[subseq, seq, nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[ + subseq, expand_layer( + seq, expand_as=subseq, + expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer( + nonseq, + expand_as=subseq, + expand_level=ExpandLevel.FROM_NO_SEQUENCE), + StaticInput(encoding) + ]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f2d419f25aebd46c340aa71a5f760a4a86c99f --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py @@ -0,0 +1,78 @@ +# edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_multi_unequalength_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(data1, data2, label): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + subseq = embedding_layer(input=data1, size=word_dim) + seq = embedding_layer(input=data2, size=word_dim) + nonseq = embedding_layer(input=label, size=word_dim) + + print_layer(input=[data1, seq, label, inner_mem]) + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', input=[subseq, seq, nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[data1, data2, label, StaticInput(encoding)]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf index 40d031741573251aa94d2a0f355470c53c51de7e..9fae974f3079c49ad03d6ba34e30190f325414e8 100644 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_seq2') + obj='process_seq') settings(batch_size=2, learning_rate=0.01) diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index 4a846397e6cf3100f948af46874b0739e32bf4a5..6b19eb0ce520a625ac68582d5c1e11c168127dc7 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -155,6 +155,15 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { } } +TEST(RecurrentGradientMachine, rnn_mixed_input) { + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_rnn_mixed_inputs.py", + "gserver/tests/sequence_rnn_matched_inputs.py", + 1e-6, + useGpu); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index eaa1cdce305c2f9d7a517e9e8c8606dc1f70780b..c519ca500afb1dbfdff6e8d211786f4e18ccf1fd 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -908,12 +908,13 @@ const T* CpuGpuVectorT::getData(bool 
useGpu) const { // Operation will change data and need to reset sync_ & syncFlag_. #define MUTABLE_VECTOR_OP(OP, useGpu, args...) \ do { \ - setSync(useGpu); \ if (useGpu) { \ copyToGpu(); \ + setSync(useGpu); \ return gpuVectorT_->OP(args); \ } else { \ copyToCpu(); \ + setSync(useGpu); \ return cpuVectorT_->OP(args); \ } \ } while (0) @@ -1030,7 +1031,7 @@ void CpuGpuVectorT::copyToCpu() { case DATA_AT_GPU: CHECK(gpuVectorT_); this->resizeOrCreate(gpuVectorT_->getSize(), false); - cpuVectorT_->copyFrom(*gpuVectorT_, HPPL_STREAM_DEFAULT); + cpuVectorT_->copyFrom(*gpuVectorT_); setSync(SYNCED); break; case DATA_AT_CPU: @@ -1049,7 +1050,7 @@ void CpuGpuVectorT::copyToGpu() { case DATA_AT_CPU: CHECK(cpuVectorT_); this->resizeOrCreate(cpuVectorT_->getSize(), true); - gpuVectorT_->copyFrom(*cpuVectorT_, HPPL_STREAM_DEFAULT); + gpuVectorT_->copyFrom(*cpuVectorT_); setSync(SYNCED); break; case DATA_AT_GPU: diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 91aca98e186aef0ad6b345cf4791ef80c616e3fe..09bd633616730dc9475edc596128166f4f70b0cd 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -149,6 +149,7 @@ struct Argument { : getBatchSize(); } + bool hasSeq() const { return sequenceStartPositions != nullptr; } bool hasSubseq() const { return subSequenceStartPositions != nullptr; } const int* getCpuStartPositions() const { diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp index 03446b3b2f6d5ff42fbf0d735a24d88bd0429747..1322e77178a4f5674f41943f886a17be8337bd75 100644 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp @@ -124,6 +124,8 @@ TEST(RecurrentGradientMachine, test_generation) { bool beam_search) { FLAGS_config_args = beam_search ? 
"beam_search=1" : "beam_search=0"; for (auto useGpu : useGpuConfs) { + LOG(INFO) << configFile << " useGpu=" << useGpu + << " beam_search=" << beam_search; testGeneration(configFile, useGpu, hasSubseq, expRetFile); } }; diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc2e3bbcde0e94b6325bd0ca1fd41e088df0b950..8edb61d1840d46edebe72a89ef2725a9d84841b9 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -333,48 +333,32 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, for linkid, link in enumerate(in_links): if isinstance(link, basestring): name = link - has_subseq = False else: name = link.link_name - has_subseq = link.has_subseq # assign target_inlinkid according to target_inlinkname if target_inlinkname == name: g_current_submodel.target_inlinkid = linkid - if in_links_count == 0: - in_links_has_subseq = has_subseq - else: - config_assert( - in_links_has_subseq == has_subseq, - "The sequence type of in_links should be the same in RecurrentLayerGroup" - ) in_links_count += 1 layer_name = MakeLayerNameInParentSubmodel(name) layer = g_layer_map[layer_name] - if has_subseq: - SequenceScatterAgentLayer(name=name, size=layer.size) - else: - ScatterAgentLayer(name=name, size=layer.size) + ScatterAgentLayer(name=name, size=layer.size) pair = g_current_submodel.in_links.add() pair.layer_name = layer_name pair.link_name = MakeLayerNameInSubmodel(name) - pair.has_subseq = has_subseq @config_func def RecurrentLayerGroupSetOutLink(link): if isinstance(link, basestring): name = link - has_subseq = False else: name = link.link_name - has_subseq = link.has_subseq layer_name = MakeLayerNameInParentSubmodel(name) pair = g_current_submodel.out_links.add() pair.layer_name = MakeLayerNameInSubmodel(name) pair.link_name = layer_name - pair.has_subseq = has_subseq def RecurrentLayerGroupSetGenerator(generator=None): @@ -425,8 +409,6 @@ def RecurrentLayerGroupEnd(name): agent_name = GetLayerBaseName(pair.link_name) if prev_submodel.HasField("generator"): DataLayer(name=agent_name, size=layer.size) - elif pair.has_subseq: - SequenceGatherAgentLayer(name=agent_name, size=layer.size) else: GatherAgentLayer(name=agent_name, size=layer.size) @@ -2253,13 +2235,6 @@ class AgentLayer(LayerBase): name, 'agent', size, inputs=[], device=device) -@config_layer('sequence_agent') -class SequenceAgentLayer(LayerBase): - def __init__(self, name, size, device=None): - super(SequenceAgentLayer, self).__init__( - name, 'sequence_agent', size, inputs=[], device=device) - - @config_layer('gather_agent') class GatherAgentLayer(LayerBase): def __init__(self, name, size, device=None): @@ -2274,20 +2249,6 @@ class ScatterAgentLayer(LayerBase): name, 'scatter_agent', size, inputs=[], device=device) -@config_layer('sequence_gather_agent') -class SequenceGatherAgentLayer(LayerBase): - def __init__(self, name, size, device=None): - super(SequenceGatherAgentLayer, self).__init__( - name, 'sequence_gather_agent', size, inputs=[], device=device) - - -@config_layer('sequence_scatter_agent') -class SequenceScatterAgentLayer(LayerBase): - def __init__(self, name, size, device=None): - super(SequenceScatterAgentLayer, self).__init__( - name, 'sequence_scatter_agent', size, inputs=[], device=device) - - @config_layer('multiplex') class MultiplexLayer(LayerBase): def __init__(self, name, inputs, size, device=None): @@ -2303,12 +2264,12 @@ class MultiplexLayer(LayerBase): @config_func -def Link( - name, - has_subseq=False, ): +def Link(name, 
has_subseq=False): + """ + Still keeping has_subseq for backward compatibility + """ link_config = LinkConfig() link_config.link_name = name - link_config.has_subseq = has_subseq return link_config @@ -2341,13 +2302,7 @@ def Memory(name, config_assert(name is not None, "name needs cannot be None") memory_name = name + "+delay1" agent_name = memory_name - if is_sequence: - config_assert( - boot_layer is not None, - "there must be boot_layer in network when is_sequence = True") - agent_layer = SequenceAgentLayer(agent_name, size) - else: - agent_layer = AgentLayer(agent_name, size) + agent_layer = AgentLayer(agent_name, size) config_assert(g_current_submodel.is_recurrent_layer_group, 'Memory should be used in recurrent layer group only') memory = g_current_submodel.memories.add() diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 2d8ddbb9007b241eb1986887d8ea6c2de8235c29..d4fd191b179666454bd6ce4d7d441aa6dc173070 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3329,8 +3329,9 @@ class StaticInput(object): input.size = size -class SubsequenceInput(object): +def SubsequenceInput(input): """ + DEPRECATED. Input sequence has sub-sequence, used in recurrent_group. The example usage is: @@ -3339,11 +3340,7 @@ class SubsequenceInput(object): input = SubsequenceInput(layer) """ - - def __init__(self, input): - assert isinstance(input, LayerOutput) - assert input.size is not None - self.input = input + return input @wrap_name_default("recurrent_group") @@ -3407,7 +3404,8 @@ def recurrent_group(step, input sequence in a reverse order. :type reverse: bool - :param targetInlink: the input layer which share info with layer group's output + :param targetInlink: DEPRECATED. + The input layer which share info with layer group's output Param input specifies multiple input layers. 
For SubsequenceInput inputs, config should assign one input @@ -3429,46 +3427,21 @@ def recurrent_group(step, model_type('recurrent_nn') def is_single_input(x): - return isinstance(x, LayerOutput) or isinstance(x, StaticInput) \ - or isinstance(x, SubsequenceInput) + return isinstance(x, LayerOutput) or isinstance(x, StaticInput) if is_single_input(input): input = [input] assert isinstance(input, collections.Sequence) def is_in_links(x): - return isinstance(x, LayerOutput) or isinstance(x, SubsequenceInput) + return isinstance(x, LayerOutput) in_links = filter(is_in_links, input) - def targetInlink_in_inlinks(): - for inlink in in_links: - if isinstance(inlink, SubsequenceInput): - if targetInlink == inlink.input: - return True - elif targetInlink == inlink: - return True - return False - - assert (targetInlink == None or targetInlink_in_inlinks()) - targetInlinkName = None if targetInlink == None \ - else targetInlink.name if isinstance(targetInlink, LayerOutput) \ - else targetInlink.input.name - - contains_sub_seq = [False] - - def map_in_links(x): - if isinstance(x, SubsequenceInput): - contains_sub_seq[0] = True - return Link(name=x.input.name, has_subseq=True) - else: - return x.name - RecurrentLayerGroupWithoutOutLinksBegin( name=name, - in_links=map(map_in_links, in_links), - seq_reversed=reverse, - target_inlinkname=targetInlinkName) + in_links=map(lambda x: x.name, in_links), + seq_reversed=reverse) in_args = [] has_LayerOutput = False for each_input in input: @@ -3476,10 +3449,7 @@ def recurrent_group(step, if isinstance(each_input, LayerOutput): in_args.append(each_input) has_LayerOutput = True - elif isinstance(each_input, SubsequenceInput): - in_args.append(each_input.input) - has_LayerOutput = True - else: + else: # StaticInput mem_name = "__%s_memory__" % each_input.input.name mem = memory( name=mem_name, @@ -3503,10 +3473,7 @@ def recurrent_group(step, for ot in layer_outs: assert isinstance(ot, LayerOutput) ot.reverse = reverse - if contains_sub_seq[0]: - RecurrentLayerGroupSetOutLink(Link(ot.name, has_subseq=True)) - else: - RecurrentLayerGroupSetOutLink(ot.name) + RecurrentLayerGroupSetOutLink(ot.name) RecurrentLayerGroupEnd(name=name) @@ -5608,13 +5575,13 @@ def row_conv_layer(input, to deploy in an online and low-latency setting. The lookahead convolution incorporates information from future subsequences in a computationally efficient manner to improve unidirectional recurrent neural networks. - + The connection of row convolution is different form the 1D sequence convolution. Assumed that, the future context-length is k, that is to say, it can get the output at timestep t by using the the input feature from t-th timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input activations are d, the activations r_t for the new layer at time-step t are: - + .. 
math:: r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}} diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr index 64530146a1458933d4ba0edffc1b1b7e60a21187..49bc5a7779151ec8aa278d4a9697dd4c7f311aca 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr @@ -261,12 +261,10 @@ sub_models { in_links { layer_name: "__simple_gru_0___transform" link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group" - has_subseq: false } out_links { layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group" link_name: "__simple_gru_0__" - has_subseq: false } target_inlinkid: -1 } @@ -285,12 +283,10 @@ sub_models { in_links { layer_name: "__simple_gru_1___transform" link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group" - has_subseq: false } out_links { layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group" link_name: "__simple_gru_1__" - has_subseq: false } target_inlinkid: -1 } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr index 79fa4c74f081aebadd258e06333de9eafe6a5ee3..f156c17fc9988cbd0f4e2853d5fe143f6480d860 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr @@ -351,12 +351,10 @@ sub_models { in_links { layer_name: "__mixed_0__" link_name: "__mixed_0__@__lstm_group_0___recurrent_group" - has_subseq: false } out_links { layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" link_name: "__lstm_group_0__" - has_subseq: false } target_inlinkid: -1 } @@ -383,12 +381,10 @@ sub_models { in_links { layer_name: "__mixed_1__" link_name: "__mixed_1__@__lstm_group_1___recurrent_group" - has_subseq: false } out_links { layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group" link_name: "__lstm_group_1__" - has_subseq: false } target_inlinkid: -1 } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index 77b447aa9db2a6c323fd3c322e7e9ca1ed19a6dd..6ec897f7d0931133b0cc517b0b65b775f4b2d14a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -155,7 +155,7 @@ layers { } layers { name: "sub_seq_input@__recurrent_group_2__" - type: "sequence_scatter_agent" + type: "scatter_agent" size: 100 active_type: "" } @@ -182,7 +182,7 @@ layers { } layers { name: "rnn_subseq_forward" - type: "sequence_gather_agent" + type: "gather_agent" size: 200 active_type: "" } @@ -623,12 +623,10 @@ sub_models { in_links { layer_name: "seq_input" link_name: "seq_input@__recurrent_group_0__" - has_subseq: false } out_links { layer_name: "rnn_forward@__recurrent_group_0__" link_name: "rnn_forward" - has_subseq: false } target_inlinkid: -1 } @@ -647,12 +645,10 @@ sub_models { in_links { layer_name: "seq_input" link_name: "seq_input@__recurrent_group_1__" - has_subseq: false } out_links { layer_name: "rnn_back@__recurrent_group_1__" link_name: "rnn_back" - has_subseq: false } target_inlinkid: -1 } @@ -671,12 +667,10 @@ sub_models { 
in_links { layer_name: "sub_seq_input" link_name: "sub_seq_input@__recurrent_group_2__" - has_subseq: true } out_links { layer_name: "rnn_subseq_forward@__recurrent_group_2__" link_name: "rnn_subseq_forward" - has_subseq: true } target_inlinkid: -1 } @@ -703,12 +697,10 @@ sub_models { in_links { layer_name: "__mixed_0__" link_name: "__mixed_0__@__lstm_group_0___recurrent_group" - has_subseq: false } out_links { layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" link_name: "__lstm_group_0__" - has_subseq: false } target_inlinkid: -1 } @@ -727,12 +719,10 @@ sub_models { in_links { layer_name: "__mixed_1__" link_name: "__mixed_1__@__gru_group_0___recurrent_group" - has_subseq: false } out_links { layer_name: "__gru_group_0__@__gru_group_0___recurrent_group" link_name: "__gru_group_0__" - has_subseq: false } target_inlinkid: -1 } @@ -751,12 +741,10 @@ sub_models { in_links { layer_name: "seq_input" link_name: "seq_input@__recurrent_group_3__" - has_subseq: false } out_links { layer_name: "__fc_layer_0__@__recurrent_group_3__" link_name: "__fc_layer_0__" - has_subseq: false } target_inlinkid: -1 }
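
The new lenToStarts() helper added to RecurrentGradientMachine.cpp above is easy to misread: it turns a vector of per-sequence lengths into cumulative start offsets in place, reusing the extra trailing slot to return the total length. The standalone snippet below is only an illustrative sketch of that behaviour and is not part of the patch; the example lengths are invented.

#include <cassert>
#include <vector>

// Same logic as the anonymous-namespace helper in RecurrentGradientMachine.cpp:
// on input, starts holds N lengths plus one scratch slot; on output, starts[i]
// is the start offset of sequence i and starts[N] is the total element count.
static void lenToStarts(std::vector<int>& starts) {
  int pos = 0;
  starts.back() = 0;
  for (auto& start : starts) {
    int tmp = start;  // remember this sequence's length
    start = pos;      // overwrite it with its start offset
    pos += tmp;       // advance by the remembered length
  }
  starts.back() = pos;  // total number of elements
}

int main() {
  std::vector<int> starts = {3, 2, 4, 0};  // lengths 3, 2, 4 plus scratch slot
  lenToStarts(starts);
  assert((starts == std::vector<int>{0, 3, 5, 9}));
  return 0;
}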