diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9a972466d66ba1417b2c31e66dc375b3da229aa8..41e0929959bafa070190b0c3ab32bee0efc0d737 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() { size_t numSequences = getGenBatchSize(); resizeBootFrame(numSequences); - // We create only two sub-network in generation for alternate use. - // Thus, we can reduce total memory of output_ in layer forward. + // We create only two sub-network in generation, one stores states of all + // layers in previous time step and the other storing the states at current + // time step. resizeOrCreateFrames(2); // outFrameLines_.size() > 1UL @@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate( - generator_.outArg.ids, - generator_.config.max_num_frames() * numSequences * resultNum, - false); + size_t maxGenWordCount = + generator_.config.max_num_frames() * numSequences * resultNum; + IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); Matrix::resizeOrCreate(generator_.outArg.in, @@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() { /* width */ resultNum, false, /* useGpu */ false); + Matrix::resizeOrCreate(generator_.outArg.value, + /* height */ maxGenWordCount, + /* width */ 1, + false, + /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, numSequences + 1, @@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() { starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); + real* idsProb = generator_.outArg.value->getData(); + size_t curPos = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; - generator_.ids.push_back(path.ids.size()); // sequence size + size_t genLen = path.ids.size(); + generator_.ids.push_back(genLen); // sequence size generator_.ids.insert( generator_.ids.end(), path.ids.begin(), path.ids.end()); generator_.ids.push_back(-1); // end of sequence + + memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); + curPos += genLen; + idsProb[curPos++] = -1.0; probs[i * numResults + j] = path.logProb; if (!j && dataArgsSize_) { diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index f245620cf668bb341df99cf498105cbd996a6b24..fb3fc5877ac96323e891f800db80af83b6809831 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -189,6 +189,11 @@ public: */ std::vector ids; + /** + * @brief idsProb, log probability of each generated words. + */ + std::vector idsProb; + /** * @brief logProb, current probability of path. */ @@ -228,11 +233,13 @@ public: */ Path(Path& old, int newId, real logProb, int machineId, int topIndex) : ids(old.ids), + idsProb(old.idsProb), logProb(old.logProb + logProb), machineId(machineId), topIndex(topIndex), seqId(old.seqId) { ids.push_back(newId); + idsProb.push_back(logProb); if (!old.probHistory.empty()) { this->probHistory = old.probHistory; // probHistory store current prob, not sum @@ -411,8 +418,9 @@ protected: struct Generator { GeneratorConfig config; - std::vector ids; // store generated sequences - Argument outArg; // final output argument + std::vector ids; // store generated sequences + std::vector idsProb; // log probability of each generated word + Argument outArg; // final output argument }; bool generating_; Generator generator_;