From 699d5f26388d01666692565ec8e8f4599c993208 Mon Sep 17 00:00:00 2001 From: zhangruiqing01 Date: Mon, 12 Sep 2016 15:06:18 +0800 Subject: [PATCH] modify RecurrentGradientMachine to support unequal length inputs * modify RecurrentGradientMachine to support hasSubSeq sequence inlinks with the same number of sentences but a different number of tokens for each sentence Change-Id: Ic71f00a4bb346b4fa93e650dfb4b1a0d8d2338b0 --- .../RecurrentGradientMachine.cpp | 262 +++++++++++------- .../RecurrentGradientMachine.h | 43 ++- proto/ModelConfig.proto.m4 | 3 + python/paddle/trainer/config_parser.py | 15 +- 4 files changed, 211 insertions(+), 112 deletions(-) diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 7bc5fe5181..e000bb2e5d 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -291,6 +290,8 @@ void RecurrentGradientMachine::init( if (subModelConfig->evaluator_names_size() > 0) { evaluator_.reset(frames_[0]->makeEvaluator()); } + + targetInfoInlinkId_ = subModelConfig->target_inlinkid(); } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -325,7 +326,7 @@ void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { for (int i = frames_.size(); i < numFrames; ++i) { std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); + NeuralNetwork::newNeuralNetwork(subModelName_)); frame->init(config_, subParamInitCb); for (auto& inFrameLine : inFrameLines_) { @@ -382,6 +383,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, size_t numSequences = input.getNumSequences(); const int* starts = input.sequenceStartPositions->getData(false); bool hasSubseq = input.hasSubseq(); + + // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the + // same inframe info + bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1; + + // Defaultly, share info with the first inlink + if (shareInlinkInfo) { + targetInfoInlinkId_ = 0; + } + // check hasSubseq in both config and input are the same CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq); @@ -394,9 +405,17 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, CHECK_EQ((size_t)input1.getNumSequences(), numSequences); // check all inputs should have same hasSubseq flag CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq); - CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, - input1.sequenceStartPositions->getData(false))); + + // if shareInlinkInfo, checks: + // 1. all inlinks have same number of total tokens + // 2. all inlinks have same number of tokens for each sentence of each + // sample. 
If hasSubseq, one sample has multiple sentence, else, one + // sample is one sentence + if (shareInlinkInfo) { + CHECK_EQ(input1.getBatchSize(), batchSize); + CHECK(std::equal(starts, starts + numSequences + 1, + input1.sequenceStartPositions->getData(false))); + } } if (hasSubseq) { @@ -408,19 +427,44 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, for (size_t i = 1; i < inFrameLines_.size(); ++i) { const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, - input1.subSequenceStartPositions->getData(false))); + if (shareInlinkInfo) { + CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + input1.subSequenceStartPositions->getData(false))); + } } } seqLengthAndStart_.clear(); - input.getSeqLengthAndStart(&seqLengthAndStart_, &maxSequenceLength_); + info_.clear(); + info_.resize(inFrameLines_.size()); + seqLengthAndStart_.resize(inFrameLines_.size()); + + { + AsyncGpuBlock asyncGpuBlock; + // if shareInlinkInfo, only calculate info of the first inlink + // else, calculate info for each inlink + if (shareInlinkInfo) { + input.getSeqLengthAndStart(&seqLengthAndStart_[0], &maxSequenceLength_); + createInFrameInfo(0, input, passType); + } else { + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); + input1.getSeqLengthAndStart(&seqLengthAndStart_[i], + &maxSequenceLength_); + createInFrameInfo(i, input1, passType); + } + } + + // inFrameLine select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + int curInlinkId = shareInlinkInfo ? 
0 : i; + selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds, + &(inFrameLines_[i].outArg), passType); + } + } resizeOrCreateFrames(maxSequenceLength_); resizeBootFrame(numSequences); - AsyncGpuBlock asyncGpuBlock; - createInFrameInfo(input, passType); - for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -443,23 +487,29 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, info_.allIds, info_.idIndex); + gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds, + info_[targetInfoInlinkId_].idIndex); } for (int i = 0; i < maxSequenceLength_; ++i) { - int idSize = info_.idIndex[i + 1] - info_.idIndex[i]; - + int idSize = 0; // connect in_links - for (auto& inFrameLine : inFrameLines_) { + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + // idSize denotes the sum number of tokens in each length i + idSize = info_[j].idIndex[i + 1] - info_[j].idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, info_.allIds, - info_.idIndex[i], idSize); + inFrameLine.outArg, info_[j].allIds, + info_[j].idIndex[i], idSize); if (hasSubseq) { - int size = info_.seqStartPosIndex[i + 1] - info_.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info_.sequenceStartPositions, info_.seqStartPosIndex[i], size); + // size: the length of subsequence + int size = + info_[j].seqStartPosIndex[i + 1] - info_[j].seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions(info_[j].sequenceStartPositions, + info_[j].seqStartPosIndex[i], + size); } } @@ -471,6 +521,10 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, } // connect memory links + // Adopt info_[0].idIndex because 
seq which has_subseq=True + // doesn't support Memory with !hasSubseq bootlayer; + // And inlinks that !hasSubSeq must have same inlink length. + idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i]; for (auto& memoryFrameLine : memoryFrameLines_) { NeuralNetwork::connect( memoryFrameLine.agents[i], @@ -560,62 +614,68 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ -void RecurrentGradientMachine::createInFrameInfo(const Argument& input, + +void RecurrentGradientMachine::createInFrameInfo(int inlinks_id, + const Argument& input, PassType passType) { bool hasSubseq = input.hasSubseq(); + // numSequences: # samples(sequences) in a batch size_t numSequences = input.getNumSequences(); std::vector allIds; - info_.idIndex.clear(); - info_.idIndex.push_back(0); // first idIndex = 0 - if (hasSubseq) { // for sequenceScatterAgentLayer + Info* inlink_info = &info_[inlinks_id]; + inlink_info->idIndex.clear(); + inlink_info->idIndex.push_back(0); // first idIndex = 0 + if (hasSubseq) { // for sequenceScatterAgentLayer + // numSubSequences : all sentences within all samples(batch) size_t numSubSequences = input.getNumSubSequences(); std::vector sequenceStartPositions; - info_.seqStartPosIndex.clear(); - info_.seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + inlink_info->seqStartPosIndex.clear(); + inlink_info->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + // maxSequenceLength_: max number of sentences(subseq) in allsamples for (int i = 0; i < maxSequenceLength_; ++i) { - sequenceStartPositions.push_back(0); // first element = 0 - for (size_t j = 0; j < numSubSequences; ++j) { - if (std::get<3>(seqLengthAndStart_[j]) == i) { - int subSeqStart = std::get<1>(seqLengthAndStart_[j]); - int subSeqLength = std::get<0>(seqLengthAndStart_[j]); + sequenceStartPositions.push_back(0); // first element = 0 + 
for (size_t j = 0; j < numSubSequences; ++j) { // for each sentence + // seqLengthAndStart_[inlinks_id][j]: + // a 4-tuple including + if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) { + // subseqstart: the cpuSubSequenceStartPositions of this subseq + int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]); + int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]); for (int k = subSeqStart; k < subSeqStart + subSeqLength; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqLength); + subSeqLength); } } - info_.idIndex.push_back(allIds.size()); - info_.seqStartPosIndex.push_back(sequenceStartPositions.size()); + inlink_info->idIndex.push_back(allIds.size()); + inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size()); } // inFrameLine create sequenceStartPositions one time CHECK_EQ(sequenceStartPositions.size(), maxSequenceLength_ + numSubSequences); - CHECK_EQ(info_.seqStartPosIndex.size(), + CHECK_EQ(inlink_info->seqStartPosIndex.size(), static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &info_.sequenceStartPositions); + createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions); } else { // for scatterAgentLayer for (int i = 0; i < maxSequenceLength_; ++i) { for (size_t j = 0; j < numSequences; ++j) { - int seqLength = std::get<0>(seqLengthAndStart_[j]); + int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]); if (i >= seqLength) { break; } - int seqStart = std::get<1>(seqLengthAndStart_[j]); + int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]); allIds.push_back(reversed_ ? 
(seqStart + seqLength - 1 - i) : (seqStart + i)); } - info_.idIndex.push_back(allIds.size()); + inlink_info->idIndex.push_back(allIds.size()); } } + // copy and check scatterId - copyScattedId(allIds, &info_.allIds, input.getBatchSize()); - CHECK_EQ(info_.idIndex.size(), static_cast(maxSequenceLength_ + 1)); - // inFrameLine select rows in real layer one time - for (auto& inFrameLine : inFrameLines_) { - selectRowsOneTime(inFrameLine.inLayer, info_.allIds, &inFrameLine.outArg, - passType); - } + copyScattedId(allIds, &inlink_info->allIds, input.getBatchSize()); + CHECK_EQ(inlink_info->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); } /* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ @@ -633,19 +693,20 @@ void RecurrentGradientMachine::createMemoryFrameInfo( sequenceStartPositions.push_back(0); // first element = 0 const int* starts = input.sequenceStartPositions->getData(false); for (size_t i = 0; i < numSequences; ++i) { - int seqId = std::get<2>(seqLengthAndStart_[i]); + // memory info adopt info of inlinks[0] + int seqId = std::get<2>(seqLengthAndStart_[0][i]); for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - starts[seqId + 1] - starts[seqId]); + starts[seqId + 1] - starts[seqId]); } createSeqPos(sequenceStartPositions, &(*memoryFrameLine).sequenceStartPositions); } else { // for scatterAgentLayer for (size_t i = 0; i < numSequences; ++i) { - allIds.push_back(std::get<2>(seqLengthAndStart_[i])); + allIds.push_back(std::get<2>(seqLengthAndStart_[0][i])); } } // copy and check scatterId @@ -699,18 +760,19 @@ size_t RecurrentGradientMachine::getGenBatchSize() { for (auto& memoryFrameLine : memoryFrameLines_) { if (!memoryFrameLine.rootLayer) continue; Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = memoryFrameLine.is_sequence ? 
- bootArg.getNumSequences() : bootArg.getBatchSize(); + size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences() + : bootArg.getBatchSize(); if (numSequences) { CHECK_EQ(numSequences, batchSize); } else { numSequences = batchSize; } } - CHECK(numSequences) << "Fail to get batch size in generation. " - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; + CHECK(numSequences) + << "Fail to get batch size in generation. " + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; return numSequences; } @@ -732,7 +794,9 @@ void RecurrentGradientMachine::generateSequence() { // connect boot frame memory links std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { ids[i] = i; } + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -756,7 +820,8 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate(generator_.outArg.ids, + IVector::resizeOrCreate( + generator_.outArg.ids, generator_.config.max_num_frames() * numSequences * resultNum, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); @@ -847,7 +912,9 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { // path.seqId = -1 indicates end of generation // of an input sequence finalPaths[seqIds_[j]].seqId = -1; - } else { scatterIds.push_back(j); } + } else { + scatterIds.push_back(j); + } } } @@ -856,13 +923,12 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - 
generator_.ids.insert(generator_.ids.end(), - finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), - finalPaths[i].machineIdVec.begin(), - finalPaths[i].machineIdVec.end()); + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); } } @@ -920,9 +986,9 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void RecurrentGradientMachine::singlePathExpand( - Path& curPath, size_t curPathId, std::vector& newPaths, - size_t expandWidth) { +void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { int calc_id = gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; @@ -946,19 +1012,20 @@ void RecurrentGradientMachine::singlePathExpand( if (id == -1) break; real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, - curPathId /*machineId*/, k /*topIndex*/); + Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, + k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( - newPath.seqId, newPath.ids, newPath.probHistory)) return; + newPath.seqId, newPath.ids, newPath.probHistory)) + return; } // outFrameLines_.size() > 1UL if (dataArgsSize_) { newPath.machineIdVec = curPath.machineIdVec; newPath.machineIdVec.push_back(curPathId); } - bool atEos = eosVec[index] == 1U || - newPath.ids.size() >= (size_t)maxSequenceLength_; + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; // adjustNewPath newPath.adjustProb(calc_id, atEos); if (this->beamSearchCtrlCallbacks_) { @@ -966,16 +1033,18 @@ void RecurrentGradientMachine::singlePathExpand( newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); } if (!newPath.isDropable()) { - atEos ? 
finalPaths_[curPath.seqId].push_back(newPath) : - newPaths.push_back(newPath); + atEos ? finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); } } // for expandWidth - if (gDiyProbStop) { gDiyProbStop(calc_id); } + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } } -void RecurrentGradientMachine::beamExpand( - std::vector& paths, std::vector& newPaths) { +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { size_t candidatePathCount = paths.size(); // idVec.size() could be larger than candidatePathCount * beam, // so user can drop some node customly. @@ -988,7 +1057,7 @@ void RecurrentGradientMachine::beamExpand( int curSeqId = 0; for (size_t j = 0; j <= candidatePathCount; j++) { // expansions of a single sequence are all processed - curSeqId = (j < candidatePathCount? paths[j].seqId : curSeqId + 1); + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); if (prevSeqId != -1 && curSeqId != prevSeqId) { totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); } @@ -1000,11 +1069,14 @@ void RecurrentGradientMachine::beamExpand( } // Drop extra nodes to beam size. 
-size_t RecurrentGradientMachine::beamShrink( - std::vector& newPaths, size_t seqId, size_t totalExpandCount) { - size_t minNewPathSize = std::min(getBeamSize(), - newPaths.size() - totalExpandCount); - if (!minNewPathSize) { return 0; } +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, newPaths.end(), Path::greaterPath); @@ -1017,11 +1089,8 @@ size_t RecurrentGradientMachine::beamShrink( // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), - finalPaths_[seqId].end(), - [&](Path& p) { - return p.logProb < minPathLogProb; - }), + std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { if (minFinalPathLogProb_[seqId] > p.logProb) { @@ -1030,7 +1099,7 @@ size_t RecurrentGradientMachine::beamShrink( } if (finalPaths_[seqId].size() >= getBeamSize() && - minFinalPathLogProb_[seqId] >= maxPathLogProb) { + minFinalPathLogProb_[seqId] >= maxPathLogProb) { newPaths.resize(totalExpandCount); return 0; } @@ -1067,7 +1136,8 @@ void RecurrentGradientMachine::fillGenOutputs() { // in beam search, here only reserved the top 1 generated result // for out_links that are not the generated word indices. 
batchMachineIdVec_.insert(batchMachineIdVec_.end(), - path.machineIdVec.begin(), path.machineIdVec.end()); + path.machineIdVec.begin(), + path.machineIdVec.end()); } } starts[i + 1] = generator_.ids.size(); @@ -1091,21 +1161,21 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { void RecurrentGradientMachine::createDataOutlink( std::vector& machineIdVec) { - size_t seqNum = getBeamSize() > 1UL ? - finalPaths_.size() : finalPaths_[0].size(); + size_t seqNum = + getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size(); std::vector starts(seqNum + 1, 0); for (size_t i = 0; i < seqNum; ++i) { - size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() : - finalPaths_[0][i].ids.size(); + size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() + : finalPaths_[0][i].ids.size(); starts[i + 1] = starts[i] + seqLen; } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, - starts, useGpu_, HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, + HPPL_STREAM_1, PASS_TEST); - auto dataAgent = dynamic_cast( - outFrameLines_[i + 1].agentLayer.get()); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); CHECK_NOTNULL(dataAgent); dataAgent->setData(dataArgs_[i]); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index cc49d13952..4ca545b504 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "GradientMachine.h" @@ -101,7 +100,7 @@ public: * Return true if this prefix or candidate is expected to be dropped. 
*/ typedef std::function&, - const std::vector&)> DropCallback; + const std::vector&)> DropCallback; /** * @brief NormOrDropNodeCallback @@ -117,7 +116,7 @@ public: * The fourth parameter is the probability of the whole path. */ typedef std::function&, - std::vector&, real*)> NormOrDropNodeCallback; + std::vector&, real*)> NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -192,7 +191,7 @@ public: int machineId; // index of sample in frame int topIndex; // index of MaxIdLayer output in one sample - int seqId; // index of sequence in batch generation + int seqId; // index of sequence in batch generation std::vector machineIdVec; /** @@ -206,7 +205,10 @@ public: /** * @brief Path default ctor, first logProb is 0. */ - Path() { logProb = 0; seqId = 0; } + Path() { + logProb = 0; + seqId = 0; + } explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } /** @@ -319,21 +321,33 @@ protected: }; std::vector memoryFrameLines_; - // All inFrameLines and outFrameLines have the same element as follows. + // Each inFrameLines(inlinks) has its own info(elements) below, + // and all outFrameLines(outlinks) share the info with one inFrameLine, + // which is assigned by targetInfoInlinkId_. 
struct Info { IVectorPtr allIds; // scattered id of realLayer std::vector idIndex; // index of allIds ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions + sequenceStartPositions; // scattered sequenceStartPositions std::vector seqStartPosIndex; // index of sequenceStartPositions }; - Info info_; + std::vector info_; - // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex) - // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) - std::vector> seqLengthAndStart_; + // each inlinks has a "std::vector>" denotes + // its sequence info: + // if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) + // else, tuple of (seqLength, seqStart, seqIndex, seqIndex) + std::vector>> seqLengthAndStart_; - void createInFrameInfo(const Argument& input, PassType passType); + // the id of inlink which share info with outlinks + int targetInfoInlinkId_; + + /* create scattered id infomation for all realLayer of inFrameLines one time. + * If hasSubseq, will also create scattered sequenceStartPositions infomation + * for all realLayer of inFrameLines one time. + */ + void createInFrameInfo(int inlinks_id, const Argument& input, + PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, PassType passType); @@ -363,6 +377,9 @@ protected: NeuralNetwork* rootNetwork_; bool reversed_; + + // if hasSubseq: max number of sentences(subseq)in batchsize samples + // else: max number of tokens in batchsize samples(sentences) int maxSequenceLength_; bool useGpu_; bool stopBeamSearch_; @@ -415,7 +432,7 @@ private: * @param machineIdVec : select a row of output matrix in each frame * that the generation process expanded. 
*/ - void createDataOutlink(std::vector & machineIdVec); + void createDataOutlink(std::vector& machineIdVec); /* * @brief used in beam search, connect previous frame to form recurrent link diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index d04620d363..a2b243a786 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -452,6 +452,9 @@ message SubModelConfig { repeated LinkConfig out_links = 10; optional GeneratorConfig generator = 11; + + // the id of inlink which share info with outlinks, used in recurrent layer group + optional int32 target_inlinkid = 12; } message ModelConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b26a63e7f3..aed317df67 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -303,7 +303,8 @@ def MakeLayerNameInSubmodel(name, submodel_name = None): @config_func def RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed=False): + seq_reversed=False, + target_inlinkname=""): global g_current_submodel config_assert(g_config.model_config.type == "recurrent_nn", "RecurrentLayerGroup should be used only in recurrent_nn") @@ -311,14 +312,19 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SubModelBegin(name) g_current_submodel.is_recurrent_layer_group = True g_current_submodel.reversed = seq_reversed + g_current_submodel.target_inlinkid = -1 in_links_count = 0 - for link in in_links: + for linkid, link in enumerate(in_links): if isinstance(link, basestring): name = link has_subseq = False else: name = link.link_name has_subseq = link.has_subseq + # assign target_inlinkid according to target_inlinkname + if target_inlinkname == name: + g_current_submodel.target_inlinkid = linkid + if in_links_count == 0: in_links_has_subseq = has_subseq else: @@ -331,6 +337,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SequenceScatterAgentLayer(name=name, size=layer.size) else: 
ScatterAgentLayer(name=name, size=layer.size) + pair = g_current_submodel.in_links.add() pair.layer_name = layer_name pair.link_name = MakeLayerNameInSubmodel(name) @@ -362,10 +369,12 @@ def RecurrentLayerGroupBegin(name, in_links, out_links, generator=None, + target_inlinkname="", seq_reversed=False): RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed) + seq_reversed, + target_inlinkname) for link in out_links: RecurrentLayerGroupSetOutLink(link) -- GitLab