diff --git a/doc/build/index.rst b/doc/build/index.rst index 2b983dceb2777e6c79ee1efaa977fef6e5c33ad6..d6d0d19e110fc35faec87da90d784a6775b9c91f 100644 --- a/doc/build/index.rst +++ b/doc/build/index.rst @@ -9,6 +9,7 @@ Install PaddlePaddle :glob: install_* + internal/install_from_jumbo.md Build from Source ----------------- diff --git a/doc/cluster/index.rst b/doc/cluster/index.rst index cf1ea97715402ec5b5b565a295ff4c1515df2570..9062f85f98d2981b5c8dcf8149e32c2ccdac77f4 100644 --- a/doc/cluster/index.rst +++ b/doc/cluster/index.rst @@ -5,3 +5,4 @@ Cluster Train :glob: opensource/cluster_train.md + internal/index.md diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst index e9182903c5f62b3a96c196d5ba1ebba2fd14f669..e21fc98c63dcdcda8202dad349ffe24dda62492d 100644 --- a/doc_cn/build_and_install/index.rst +++ b/doc_cn/build_and_install/index.rst @@ -9,7 +9,11 @@ Note: The intallation packages are still in pre-release state and your experienc .. toctree:: :maxdepth: 1 + :glob: + 源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst> + 使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst> + 从源码编译安装(对内) <../build/internal/build_from_source_zh_cn.rst> install/docker_install.rst install/ubuntu_install.rst cmake/index.rst diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..25313a9635bbf567a1aedfac3c379802d601d283 --- /dev/null +++ b/doc_cn/cluster/index.rst @@ -0,0 +1,11 @@ +集群训练 +======== + +* `集群训练 <../../doc/cluster/index.html>`_ + +.. toctree:: + :maxdepth: 2 + :glob: + + 集群训练(对内) + diff --git a/doc_cn/index.rst b/doc_cn/index.rst index 5f06463899f6b7b8166ff2cccd87b17817c6f5d1..6cf5588b5b34f5e80ea4c70cc364d4c6c42cce3d 100644 --- a/doc_cn/index.rst +++ b/doc_cn/index.rst @@ -8,7 +8,7 @@ PaddlePaddle文档 * `用户接口 `_ * `使用示例 `_ * `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_ -* `集群训练 <../doc/cluster/index.html>`_ +* `集群训练 `_ 开发指南 -------- diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index d0b1c0447d23d3e7072b2ee4f8e860708eb44bb2..cd4ed19c2ca45c310032c834da4cad56fb1cbdff 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -194,8 +194,8 @@ public: virtual real evalImp(std::vector& arguments) { CHECK_EQ(arguments.size(), (size_t)2); Argument output, label; - output.resizeAndCopyFrom(arguments[0], false); - label.resizeAndCopyFrom(arguments[1], false); + output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); + label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); CHECK(label.sequenceStartPositions); CHECK(label.ids); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 7bc5fe51813c94a4347118f1366370ec8b867e02..96b0e19880b68fa2fb0ac5ad55af6d00ed401224 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -291,6 +290,8 @@ void RecurrentGradientMachine::init( if (subModelConfig->evaluator_names_size() > 0) { evaluator_.reset(frames_[0]->makeEvaluator()); } + + targetInfoInlinkId_ = subModelConfig->target_inlinkid(); } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -325,7 +326,7 @@ void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { for (int i = frames_.size(); i < numFrames; ++i) { std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); + NeuralNetwork::newNeuralNetwork(subModelName_)); frame->init(config_, subParamInitCb); for (auto& inFrameLine : inFrameLines_) { @@ -382,6 +383,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, size_t numSequences = input.getNumSequences(); const int* starts = input.sequenceStartPositions->getData(false); bool hasSubseq = input.hasSubseq(); + + // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the + // same inframe info + bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1; + + // Defaultly, share info with the first inlink + if (shareInlinkInfo) { + targetInfoInlinkId_ = 0; + } + // check hasSubseq in both config and input are the same CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq); @@ -394,9 +405,17 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, CHECK_EQ((size_t)input1.getNumSequences(), numSequences); // check all inputs should have same hasSubseq flag CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq); - CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, - input1.sequenceStartPositions->getData(false))); + + // if shareInlinkInfo, checks: + // 1. all inlinks have same number of total tokens + // 2. all inlinks have same number of tokens for each sentence of each + // sample. If hasSubseq, one sample has multiple sentence, else, one + // sample is one sentence + if (shareInlinkInfo) { + CHECK_EQ(input1.getBatchSize(), batchSize); + CHECK(std::equal(starts, starts + numSequences + 1, + input1.sequenceStartPositions->getData(false))); + } } if (hasSubseq) { @@ -408,19 +427,44 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, for (size_t i = 1; i < inFrameLines_.size(); ++i) { const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, - input1.subSequenceStartPositions->getData(false))); + if (shareInlinkInfo) { + CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + input1.subSequenceStartPositions->getData(false))); + } } } seqLengthAndStart_.clear(); - input.getSeqLengthAndStart(&seqLengthAndStart_, &maxSequenceLength_); + info_.clear(); + info_.resize(inFrameLines_.size()); + seqLengthAndStart_.resize(inFrameLines_.size()); + + { + AsyncGpuBlock asyncGpuBlock; + // if shareInlinkInfo, only calculate info of the first inlink + // else, calculate info for each inlink + if (shareInlinkInfo) { + input.getSeqLengthAndStart(&seqLengthAndStart_[0], &maxSequenceLength_); + createInFrameInfo(0, input, passType); + } else { + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); + input1.getSeqLengthAndStart(&seqLengthAndStart_[i], + &maxSequenceLength_); + createInFrameInfo(i, input1, passType); + } + } + + // inFrameLine select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + int curInlinkId = shareInlinkInfo ? 0 : i; + selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds, + &(inFrameLines_[i].outArg), passType); + } + } resizeOrCreateFrames(maxSequenceLength_); resizeBootFrame(numSequences); - AsyncGpuBlock asyncGpuBlock; - createInFrameInfo(input, passType); - for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -443,23 +487,29 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, info_.allIds, info_.idIndex); + gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds, + info_[targetInfoInlinkId_].idIndex); } for (int i = 0; i < maxSequenceLength_; ++i) { - int idSize = info_.idIndex[i + 1] - info_.idIndex[i]; - + int idSize = 0; // connect in_links - for (auto& inFrameLine : inFrameLines_) { + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + // idSize denotes the sum number of tokens in each length i + idSize = info_[j].idIndex[i + 1] - info_[j].idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, info_.allIds, - info_.idIndex[i], idSize); + inFrameLine.outArg, info_[j].allIds, + info_[j].idIndex[i], idSize); if (hasSubseq) { - int size = info_.seqStartPosIndex[i + 1] - info_.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info_.sequenceStartPositions, info_.seqStartPosIndex[i], size); + // size: the length of subsequence + int size = + info_[j].seqStartPosIndex[i + 1] - info_[j].seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions(info_[j].sequenceStartPositions, + info_[j].seqStartPosIndex[i], + size); } } @@ -469,13 +519,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, dynamic_cast(outFrameLine.agentLayer.get()); gatherAgent->addRealLayer(outFrameLine.frames[i]); } - // connect memory links + // Adopt info_[0].idIndex because seq which has_subseq=True + // doesn't support Memory with !hasSubseq bootlayer; + // And inlinks that !hasSubSeq must have same inlink length. + idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i]; for (auto& memoryFrameLine : memoryFrameLines_) { NeuralNetwork::connect( memoryFrameLine.agents[i], i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], - idSize /*height of agent*/); + numSeqs_[i] /*height of agent*/); } } @@ -560,62 +613,76 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ -void RecurrentGradientMachine::createInFrameInfo(const Argument& input, + +void RecurrentGradientMachine::createInFrameInfo(int inlinks_id, + const Argument& input, PassType passType) { bool hasSubseq = input.hasSubseq(); + // numSequences: # samples(sequences) in a batch size_t numSequences = input.getNumSequences(); std::vector allIds; - info_.idIndex.clear(); - info_.idIndex.push_back(0); // first idIndex = 0 - if (hasSubseq) { // for sequenceScatterAgentLayer + + numSeqs_.clear(); + Info* inlink_info = &info_[inlinks_id]; + inlink_info->idIndex.clear(); + inlink_info->idIndex.push_back(0); // first idIndex = 0 + if (hasSubseq) { // for sequenceScatterAgentLayer + // numSubSequences : all sentences within all samples(batch) size_t numSubSequences = input.getNumSubSequences(); std::vector sequenceStartPositions; - info_.seqStartPosIndex.clear(); - info_.seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + inlink_info->seqStartPosIndex.clear(); + inlink_info->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + // maxSequenceLength_: max number of sentences(subseq) in allsamples for (int i = 0; i < maxSequenceLength_; ++i) { - sequenceStartPositions.push_back(0); // first element = 0 - for (size_t j = 0; j < numSubSequences; ++j) { - if (std::get<3>(seqLengthAndStart_[j]) == i) { - int subSeqStart = std::get<1>(seqLengthAndStart_[j]); - int subSeqLength = std::get<0>(seqLengthAndStart_[j]); + sequenceStartPositions.push_back(0); // first element = 0 + int numSeqs = 0; + for (size_t j = 0; j < numSubSequences; ++j) { // for each sentence + // seqLengthAndStart_[inlinks_id][j]: + // a 4-tuple including + if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) { + ++numSeqs; + // subseqstart: the cpuSubSequenceStartPositions of this subseq + int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]); + int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]); for (int k = subSeqStart; k < subSeqStart + subSeqLength; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqLength); + subSeqLength); } } - info_.idIndex.push_back(allIds.size()); - info_.seqStartPosIndex.push_back(sequenceStartPositions.size()); + inlink_info->idIndex.push_back(allIds.size()); + inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size()); + numSeqs_.push_back(numSeqs); } // inFrameLine create sequenceStartPositions one time CHECK_EQ(sequenceStartPositions.size(), maxSequenceLength_ + numSubSequences); - CHECK_EQ(info_.seqStartPosIndex.size(), + CHECK_EQ(inlink_info->seqStartPosIndex.size(), static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &info_.sequenceStartPositions); + createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions); } else { // for scatterAgentLayer for (int i = 0; i < maxSequenceLength_; ++i) { + int numSeqs = 0; for (size_t j = 0; j < numSequences; ++j) { - int seqLength = std::get<0>(seqLengthAndStart_[j]); + int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]); if (i >= seqLength) { break; } - int seqStart = std::get<1>(seqLengthAndStart_[j]); + ++numSeqs; + int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]); allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) : (seqStart + i)); } - info_.idIndex.push_back(allIds.size()); + inlink_info->idIndex.push_back(allIds.size()); + numSeqs_.push_back(numSeqs); } } + // copy and check scatterId - copyScattedId(allIds, &info_.allIds, input.getBatchSize()); - CHECK_EQ(info_.idIndex.size(), static_cast(maxSequenceLength_ + 1)); - // inFrameLine select rows in real layer one time - for (auto& inFrameLine : inFrameLines_) { - selectRowsOneTime(inFrameLine.inLayer, info_.allIds, &inFrameLine.outArg, - passType); - } + copyScattedId(allIds, &inlink_info->allIds, input.getBatchSize()); + CHECK_EQ(inlink_info->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); } /* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ @@ -633,19 +700,20 @@ void RecurrentGradientMachine::createMemoryFrameInfo( sequenceStartPositions.push_back(0); // first element = 0 const int* starts = input.sequenceStartPositions->getData(false); for (size_t i = 0; i < numSequences; ++i) { - int seqId = std::get<2>(seqLengthAndStart_[i]); + // memory info adopt info of inlinks[0] + int seqId = std::get<2>(seqLengthAndStart_[0][i]); for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - starts[seqId + 1] - starts[seqId]); + starts[seqId + 1] - starts[seqId]); } createSeqPos(sequenceStartPositions, &(*memoryFrameLine).sequenceStartPositions); } else { // for scatterAgentLayer for (size_t i = 0; i < numSequences; ++i) { - allIds.push_back(std::get<2>(seqLengthAndStart_[i])); + allIds.push_back(std::get<2>(seqLengthAndStart_[0][i])); } } // copy and check scatterId @@ -699,18 +767,19 @@ size_t RecurrentGradientMachine::getGenBatchSize() { for (auto& memoryFrameLine : memoryFrameLines_) { if (!memoryFrameLine.rootLayer) continue; Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = memoryFrameLine.is_sequence ? - bootArg.getNumSequences() : bootArg.getBatchSize(); + size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences() + : bootArg.getBatchSize(); if (numSequences) { CHECK_EQ(numSequences, batchSize); } else { numSequences = batchSize; } } - CHECK(numSequences) << "Fail to get batch size in generation. " - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; + CHECK(numSequences) + << "Fail to get batch size in generation. " + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; return numSequences; } @@ -732,7 +801,9 @@ void RecurrentGradientMachine::generateSequence() { // connect boot frame memory links std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { ids[i] = i; } + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -756,7 +827,8 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate(generator_.outArg.ids, + IVector::resizeOrCreate( + generator_.outArg.ids, generator_.config.max_num_frames() * numSequences * resultNum, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); @@ -847,7 +919,9 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { // path.seqId = -1 indicates end of generation // of an input sequence finalPaths[seqIds_[j]].seqId = -1; - } else { scatterIds.push_back(j); } + } else { + scatterIds.push_back(j); + } } } @@ -856,13 +930,12 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), - finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), - finalPaths[i].machineIdVec.begin(), - finalPaths[i].machineIdVec.end()); + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); } } @@ -920,9 +993,9 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void RecurrentGradientMachine::singlePathExpand( - Path& curPath, size_t curPathId, std::vector& newPaths, - size_t expandWidth) { +void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { int calc_id = gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; @@ -946,19 +1019,20 @@ void RecurrentGradientMachine::singlePathExpand( if (id == -1) break; real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, - curPathId /*machineId*/, k /*topIndex*/); + Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, + k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( - newPath.seqId, newPath.ids, newPath.probHistory)) return; + newPath.seqId, newPath.ids, newPath.probHistory)) + return; } // outFrameLines_.size() > 1UL if (dataArgsSize_) { newPath.machineIdVec = curPath.machineIdVec; newPath.machineIdVec.push_back(curPathId); } - bool atEos = eosVec[index] == 1U || - newPath.ids.size() >= (size_t)maxSequenceLength_; + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; // adjustNewPath newPath.adjustProb(calc_id, atEos); if (this->beamSearchCtrlCallbacks_) { @@ -966,16 +1040,18 @@ void RecurrentGradientMachine::singlePathExpand( newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); } if (!newPath.isDropable()) { - atEos ? finalPaths_[curPath.seqId].push_back(newPath) : - newPaths.push_back(newPath); + atEos ? finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); } } // for expandWidth - if (gDiyProbStop) { gDiyProbStop(calc_id); } + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } } -void RecurrentGradientMachine::beamExpand( - std::vector& paths, std::vector& newPaths) { +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { size_t candidatePathCount = paths.size(); // idVec.size() could be larger than candidatePathCount * beam, // so user can drop some node customly. @@ -988,7 +1064,7 @@ void RecurrentGradientMachine::beamExpand( int curSeqId = 0; for (size_t j = 0; j <= candidatePathCount; j++) { // expansions of a single sequence are all processed - curSeqId = (j < candidatePathCount? paths[j].seqId : curSeqId + 1); + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); if (prevSeqId != -1 && curSeqId != prevSeqId) { totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); } @@ -1000,11 +1076,14 @@ void RecurrentGradientMachine::beamExpand( } // Drop extra nodes to beam size. -size_t RecurrentGradientMachine::beamShrink( - std::vector& newPaths, size_t seqId, size_t totalExpandCount) { - size_t minNewPathSize = std::min(getBeamSize(), - newPaths.size() - totalExpandCount); - if (!minNewPathSize) { return 0; } +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, newPaths.end(), Path::greaterPath); @@ -1017,11 +1096,8 @@ size_t RecurrentGradientMachine::beamShrink( // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), - finalPaths_[seqId].end(), - [&](Path& p) { - return p.logProb < minPathLogProb; - }), + std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { if (minFinalPathLogProb_[seqId] > p.logProb) { @@ -1030,7 +1106,7 @@ size_t RecurrentGradientMachine::beamShrink( } if (finalPaths_[seqId].size() >= getBeamSize() && - minFinalPathLogProb_[seqId] >= maxPathLogProb) { + minFinalPathLogProb_[seqId] >= maxPathLogProb) { newPaths.resize(totalExpandCount); return 0; } @@ -1067,7 +1143,8 @@ void RecurrentGradientMachine::fillGenOutputs() { // in beam search, here only reserved the top 1 generated result // for out_links that are not the generated word indices. batchMachineIdVec_.insert(batchMachineIdVec_.end(), - path.machineIdVec.begin(), path.machineIdVec.end()); + path.machineIdVec.begin(), + path.machineIdVec.end()); } } starts[i + 1] = generator_.ids.size(); @@ -1091,21 +1168,21 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { void RecurrentGradientMachine::createDataOutlink( std::vector& machineIdVec) { - size_t seqNum = getBeamSize() > 1UL ? - finalPaths_.size() : finalPaths_[0].size(); + size_t seqNum = + getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size(); std::vector starts(seqNum + 1, 0); for (size_t i = 0; i < seqNum; ++i) { - size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() : - finalPaths_[0][i].ids.size(); + size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() + : finalPaths_[0][i].ids.size(); starts[i + 1] = starts[i] + seqLen; } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, - starts, useGpu_, HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, + HPPL_STREAM_1, PASS_TEST); - auto dataAgent = dynamic_cast( - outFrameLines_[i + 1].agentLayer.get()); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); CHECK_NOTNULL(dataAgent); dataAgent->setData(dataArgs_[i]); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index cc49d13952323db6e514ea437552d076187d91e2..d9901ad81d60ae7c30ac89e3fc34ef6c179b5dd5 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "GradientMachine.h" @@ -101,7 +100,7 @@ public: * Return true if this prefix or candidate is expected to be dropped. */ typedef std::function&, - const std::vector&)> DropCallback; + const std::vector&)> DropCallback; /** * @brief NormOrDropNodeCallback @@ -117,7 +116,7 @@ public: * The fourth parameter is the probability of the whole path. */ typedef std::function&, - std::vector&, real*)> NormOrDropNodeCallback; + std::vector&, real*)> NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -192,7 +191,7 @@ public: int machineId; // index of sample in frame int topIndex; // index of MaxIdLayer output in one sample - int seqId; // index of sequence in batch generation + int seqId; // index of sequence in batch generation std::vector machineIdVec; /** @@ -206,7 +205,10 @@ public: /** * @brief Path default ctor, first logProb is 0. */ - Path() { logProb = 0; seqId = 0; } + Path() { + logProb = 0; + seqId = 0; + } explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } /** @@ -319,21 +321,37 @@ protected: }; std::vector memoryFrameLines_; - // All inFrameLines and outFrameLines have the same element as follows. + // Each inFrameLines(inlinks) has its own info(elements) below, + // and all outFrameLines(outlinks) share the info with one inFrameLine, + // which is assigned by targetInfoInlinkId_. struct Info { IVectorPtr allIds; // scattered id of realLayer std::vector idIndex; // index of allIds ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions + sequenceStartPositions; // scattered sequenceStartPositions std::vector seqStartPosIndex; // index of sequenceStartPositions }; - Info info_; + std::vector info_; + + // numSeqs_[i] is the number sequences which is longer than i (for sequence + // data) or has more than i subsequences (for subsequence data) + std::vector numSeqs_; - // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex) - // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) - std::vector> seqLengthAndStart_; + // each inlinks has a "std::vector>" denotes + // its sequence info: + // if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) + // else, tuple of (seqLength, seqStart, seqIndex, seqIndex) + std::vector>> seqLengthAndStart_; - void createInFrameInfo(const Argument& input, PassType passType); + // the id of inlink which share info with outlinks + int targetInfoInlinkId_; + + /* create scattered id infomation for all realLayer of inFrameLines one time. + * If hasSubseq, will also create scattered sequenceStartPositions infomation + * for all realLayer of inFrameLines one time. + */ + void createInFrameInfo(int inlinks_id, const Argument& input, + PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, PassType passType); @@ -363,6 +381,9 @@ protected: NeuralNetwork* rootNetwork_; bool reversed_; + + // if hasSubseq: max number of sentences(subseq)in batchsize samples + // else: max number of tokens in batchsize samples(sentences) int maxSequenceLength_; bool useGpu_; bool stopBeamSearch_; @@ -415,7 +436,7 @@ private: * @param machineIdVec : select a row of output matrix in each frame * that the generation process expanded. */ - void createDataOutlink(std::vector & machineIdVec); + void createDataOutlink(std::vector& machineIdVec); /* * @brief used in beam search, connect previous frame to form recurrent link diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp index db1450694ecf7608fb37790e841b967288378e1f..6b9ffc5c749fb45be567881b8e625b48e28f69b4 100644 --- a/paddle/gserver/layers/CTCLayer.cpp +++ b/paddle/gserver/layers/CTCLayer.cpp @@ -49,8 +49,10 @@ void CTCLayer::forward(PassType passType) { Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]); } else { forwardImp(getInput(0), getInput(1)); @@ -92,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) { if (useGpu_) { backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); const_cast(getInput(1)). - resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); } else { backwardImp(callback, getInput(0), getInput(1)); } diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index d08c422764e5642816a94fc55b5b67445ffb42f7..8c72c1778451dfddbaa740921cd08cf73fe56785 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -248,7 +248,7 @@ void ConvOperator::forward() { CHECK_EQ(ins_[1]->value->getHeight(), batchSize); checkFilterSize(ins_[1]->value); Matrix::resizeOrCreate(out_->value, batchSize, - outputH_ * outputW_ * numFilters_); + outputH_ * outputW_ * numFilters_, false, useGpu_); { AsyncGpuBlock block; for (size_t batchId = 0; batchId < batchSize; ++batchId) { diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index f353afabb3b7162783fef4f9093630fb826c86cb..0f99aee03200c3834c7c27343f41f77edc5a558e 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -509,8 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, Matrix &cost) { if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); } forwardImpIn(output, label, cost); } diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp index 41c1461967ae1c0ff3c4b3a11e8f7405b58f6ab9..b39c9948b53118b51090059fc554e76f94316f81 100644 --- a/paddle/gserver/layers/SamplingIdLayer.cpp +++ b/paddle/gserver/layers/SamplingIdLayer.cpp @@ -52,8 +52,10 @@ public: Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0]); } else { forwardImp(getInput(0)); diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index f72011ae16cb3bac73e8acd5338bd7a179da329b..552a6c5b41c7f896c52b2132578b136200967573 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); @@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/gserver/tests/Sequence/dummy.list new file mode 100644 index 0000000000000000000000000000000000000000..0e52665e11298965df5738f69c5bcefcc8bab0f9 --- /dev/null +++ b/paddle/gserver/tests/Sequence/dummy.list @@ -0,0 +1 @@ +dummy_file_no_use diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..85a83554c5c3045d144ee0250d2808237eccc9e0 --- /dev/null +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -0,0 +1,35 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * + +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(2)]) +def process_subseq(settings, file_name): + for d in data: + yield d + +@provider(input_types=[integer_value_sequence(10), + integer_value(2)]) +def process_seq(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index e4727e472d446b48e6001968841bfc178e34ec0c..cb83d79d78cc677d5ffeb77f5693d08da2a51668 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -#coding=utf-8 - # Copyright (c) 2016 Baidu, Inc. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..03eef7a2175880529b2743adc3ca2aaf3622c095 --- /dev/null +++ b/paddle/gserver/tests/sequence_nest_rnn.conf @@ -0,0 +1,75 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierachical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn.conf + +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + return fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + + inner_rnn_output = recurrent_group( + step=inner_step, + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" should also work. But currently RecurrentGradientMachine + # does not handle it correctly. Current implementation requires that + # all the out links are from sequences. However, it does not report error + # when the out links are not sequences. + return inner_rnn_output + +out = recurrent_group( + step=outer_step, + input=SubsequenceInput(emb)) + +value_printer_evaluator(input=out) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..73e7a5935f6c3cf1857d2b952471bb969d86211a --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn.conf @@ -0,0 +1,57 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + return fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + +out = recurrent_group( + step=step, + input=emb) + +value_printer_evaluator(input=out) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index 35d6ee7f4a402d198dbcd1df7b272dcd65723659..f6989e9a6463a286f5df43e951b2d79e74c3d16a 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include #include +P_DECLARE_int32(seed); + using namespace paddle; // NOLINT using namespace std; // NOLINT class TrainerForTest : public paddle::Trainer { @@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost, CpuVector vecMomentum(dim); // vecW needs to be assigned, otherwise the variable is an uncertain value. - vecW.zeroMem(); + + *ThreadLocalRand::getSeed() = FLAGS_seed; + vecW.randnorm(0, 0.1); trainer.startTrain(); for (int i = 0; i < num_passes; ++i) { @@ -88,15 +92,13 @@ void CalCost(const string& conf, const string& dir, real* cost, rmDir(dir.c_str()); } -TEST(RecurrentGradientMachine, HasSubSequence) { +void test(const string& conf1, const string& conf2) { int num_passes = 5; real* cost1 = new real[num_passes]; - const string conf1 = "gserver/tests/sequence_layer_group.conf"; const string dir1 = "gserver/tests/t1"; CalCost(conf1, dir1, cost1, num_passes); real* cost2 = new real[num_passes]; - const string conf2 = "gserver/tests/sequence_nest_layer_group.conf"; const string dir2 = "gserver/tests/t2"; CalCost(conf2, dir2, cost2, num_passes); @@ -109,6 +111,17 @@ TEST(RecurrentGradientMachine, HasSubSequence) { delete[] cost2; } +TEST(RecurrentGradientMachine, HasSubSequence) { + test("gserver/tests/sequence_layer_group.conf", + "gserver/tests/sequence_nest_layer_group.conf"); +} + +TEST(RecurrentGradientMachine, rnn) { + test("gserver/tests/sequence_rnn.conf", + "gserver/tests/sequence_nest_rnn.conf"); +} + + int main(int argc, char** argv) { if (paddle::version::isWithPyDataProvider()) { if (!paddle::version::isWithGpu()) { diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 2cea190b859496cd635fc5a8d1834779537d50e6..9b933b153d158bef565c0964232525ba99b8b3d4 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, Argument& cpuInput = testCpu.dataLayer_->getOutput(); Argument& gpuInput = testGpu.dataLayer_->getOutput(); gpuInput.resizeAndCopyFrom(cpuInput, true); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index f3a6503d4a21ff8766f3289f8eee992d4d13045d..1b7f9ac5dac16c167dcc22930c28bc3521162b9b 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, if (!matrix) { matrix = Matrix::create(height, width, trans, useGpu); } else { + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width); } } @@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, } else { CHECK(dynamic_cast(matrix.get()) || dynamic_cast(matrix.get())); + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width, nnz, valueType, format); } } diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index b1a459b86aa4ff70e4e07267c8a902123f9d17c0..7553ea25e09d2f52f1f8b9205f954510b77cbfa9 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -800,6 +800,7 @@ void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { } else if ((!useGpu) && (!cpuVectorT_)) { cpuVectorT_ = VectorT::create(size, false); } else { + CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_)); this->resize(size, useGpu); } } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 8610a66452358e1b2e2a846ddfcf62a0ce99e22e..a81c72aacbed3f4077afd5d604d16c922cf07013 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -25,6 +25,7 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, if (!dest) { dest = src->clone(0, 0, useGpu); } else { + CHECK_EQ(dest->useGpu(), useGpu); dest->resize(src->getHeight(), src->getWidth()); } dest->copyFrom(*src, stream); @@ -60,12 +61,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startRow + copySize, src->getHeight()); - int height = copySize; int width = src->getWidth(); if (!dest) { dest = src->clone(height, width, useGpu); } else { + CHECK_EQ(dest->useGpu(), useGpu); dest->resize(height, width); } MatrixPtr submat = src->subMatrix(startRow, copySize); @@ -182,6 +183,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, } } +void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { + resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); +} + void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream) { dataId = src.dataId; @@ -199,6 +205,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, resizeAndCopy(strs, src.strs, useGpu, stream); } +int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, + int32_t copySize, bool useGpu) { + int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu, + HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; +} + int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t copySize, bool useGpu, hl_stream_t stream) { diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index c444ebaf12930e938a3a4d75541d0fbf5bbb01ac..5474a05b84101ab46e0d1826626dd59399a328a9 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -203,13 +203,28 @@ struct Argument { * startSeq: the sample id of start * copySize: how many samples need to copy * return value: how many samples are copied + * Note that when specifying the stream explicitly in this case, + * synchronize should also be called somewhere after this function */ int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT); + int32_t copySize, bool useGpu, hl_stream_t stream); - void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT); + /* + * same with the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, + int32_t copySize, bool useGpu = FLAGS_use_gpu); + + void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); + + /* + * same with the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); /* @brief Concatenate several arguments into one and put the result into it. @@ -240,6 +255,15 @@ struct Argument { /* Get Sequence Length, startPositions and max Length according to input + 1. For sequence data: + Each tuple is (seq_length, seq_start, seq_id, seq_id) + The tuples are sorted according to seq_length or subseq_length + *maxSequenceLength is the maximal sequence length + + 2. For subsequence data: + Each tuple is (subseq_length, subseq_start, seq_id, subseq_id) + The tuples are not sorted. They are in the original order. + *maxSequenceLenth is the maximal number of subsequences in each sequence. */ void getSeqLengthAndStart( std::vector>* seqLengthAndStart, diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index d04620d363c14923455d68734b03ef9bb3f28f78..a2b243a7869eaff120b25ece35e95be4d4284d18 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -452,6 +452,9 @@ message SubModelConfig { repeated LinkConfig out_links = 10; optional GeneratorConfig generator = 11; + + // the id of inlink which share info with outlinks, used in recurrent layer group + optional int32 target_inlinkid = 12; } message ModelConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5b60cf8410e0cd4270f7a1713afdc83027ffba10..7a6053940178db472999ed7a0d4c49cce40680e4 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -303,7 +303,8 @@ def MakeLayerNameInSubmodel(name, submodel_name = None): @config_func def RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed=False): + seq_reversed=False, + target_inlinkname=""): global g_current_submodel config_assert(g_config.model_config.type == "recurrent_nn", "RecurrentLayerGroup should be used only in recurrent_nn") @@ -311,14 +312,19 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SubModelBegin(name) g_current_submodel.is_recurrent_layer_group = True g_current_submodel.reversed = seq_reversed + g_current_submodel.target_inlinkid = -1 in_links_count = 0 - for link in in_links: + for linkid, link in enumerate(in_links): if isinstance(link, basestring): name = link has_subseq = False else: name = link.link_name has_subseq = link.has_subseq + # assign target_inlinkid according to target_inlinkname + if target_inlinkname == name: + g_current_submodel.target_inlinkid = linkid + if in_links_count == 0: in_links_has_subseq = has_subseq else: @@ -331,6 +337,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SequenceScatterAgentLayer(name=name, size=layer.size) else: ScatterAgentLayer(name=name, size=layer.size) + pair = g_current_submodel.in_links.add() pair.layer_name = layer_name pair.link_name = MakeLayerNameInSubmodel(name) @@ -362,10 +369,12 @@ def RecurrentLayerGroupBegin(name, in_links, out_links, generator=None, + target_inlinkname="", seq_reversed=False): RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed) + seq_reversed, + target_inlinkname) for link in out_links: RecurrentLayerGroupSetOutLink(link)