add implementations.

8f4ca2d1 · caoying03 · a037b099 · 8f4ca2d1 · 8f4ca2d1 · 8f4ca2d1
3 changed file
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -16,6 +16,168 @@ limitations under the License. */
 namespace paddle {
+/*
+ * Locate the gold label in every beam expansion, in order, recording its row
+ * (goldRowIds_) and column (goldColIds_) among the selected candidates.
+ *
+ * The scan stops at the first expansion whose gold row does not contain the
+ * gold label. validExpansionCount_ counts the expansions examined (including
+ * the one where the scan stopped), and goldAsExtraPath_ stays true unless the
+ * gold label was found in the final expansion.
+ */
+void CostForOneSequence::calValidExpandStep() {
+  validExpansionCount_ = 0;
+  goldAsExtraPath_ = true;
+  const size_t stepCount = beams_->expansionCount;
+  for (size_t step = 0; step < stepCount; ++step) {
+    const real goldId = static_cast<real>(beams_->gold[step]);
+    if (step == 0) {
+      goldRowIds_[step] = 0;
+    } else {
+      /*
+       * The gold row in this expansion is the number of candidates (entries
+       * different from -1) that precede the gold candidate in the previous
+       * expansion.
+       */
+      real* prevIds = beams_->candidateIds[step - 1]->getData();
+      real* prevGoldPos =
+          prevIds + goldRowIds_[step - 1] * beamSize_ + goldColIds_[step - 1];
+      goldRowIds_[step] = std::count_if(
+          prevIds, prevGoldPos, [](const real& id) { return id != -1.; });
+    }
+    real* rowBegin =
+        beams_->candidateIds[step]->getData() + goldRowIds_[step] * beamSize_;
+    real* rowEnd = rowBegin + beamSize_;
+    real* goldPos = std::find(rowBegin, rowEnd, goldId);
+    ++validExpansionCount_;
+    /* the gold label is not among the candidates of its row: stop here. */
+    if (goldPos == rowEnd) return;
+    goldColIds_[step] = goldPos - rowBegin;
+  }
+  if (goldColIds_[stepCount - 1] != -1) goldAsExtraPath_ = false;
+}
+/*
+ * Set up the path bookkeeping for the last valid expansion.
+ *
+ * Sizes pathRowIdsInEachBeam_ and parentIdsInBeam_ for every path, fills in
+ * the score-row index and parent row of each candidate of the last valid
+ * expansion, and records in goldIdsInFinalExpansion_ which path is the gold
+ * one. Returns the total number of paths (including the extra gold path, if
+ * one is appended).
+ */
+size_t CostForOneSequence::initLastExpansion() {
+  const int lastBeam = validExpansionCount_ - 1;
+  const MatrixPtr candidates = beams_->candidateIds[lastBeam];
+  real* candData = candidates->getData();
+  const size_t rowCount = candidates->getHeight();
+  /* entries equal to -1 are not counted as candidates. */
+  const auto isCandidate = [](const real& id) { return id != -1.; };
+  /* count the paths in the last expansion. */
+  size_t pathCount =
+      std::count_if(candData, candData + rowCount * beamSize_, isCandidate);
+  /*
+   * if the gold sequence falls off the beam during search,
+   * add the gold sequence as the last path into all expanded paths.
+   */
+  if (goldAsExtraPath_) {
+    goldIdsInFinalExpansion_ = pathCount;
+    ++pathCount;
+  }
+  pathRowIdsInEachBeam_.assign(validExpansionCount_,
+                               std::vector<int>(pathCount, 0));
+  parentIdsInBeam_.assign(pathCount, 0);
+  if (goldAsExtraPath_) {
+    /* add gold sequence into the total expansion. */
+    const int goldRow = goldRowIds_[validExpansionCount_ - 1];
+    pathRowIdsInEachBeam_[lastBeam].back() =
+        beams_->gold[lastBeam] + getSeqStartPos(lastBeam, goldRow);
+    parentIdsInBeam_.back() = goldRow;
+  } else {
+    /* the gold path index is the number of candidates preceding it. */
+    const size_t goldOffset =
+        goldRowIds_[lastBeam] * beamSize_ + goldColIds_[lastBeam];
+    goldIdsInFinalExpansion_ =
+        std::count_if(candData, candData + goldOffset, isCandidate);
+  }
+  /*
+   * TODO(caoying): fix this, store the indices of selected candidate
+   * paths into Argument.ids
+   */
+  size_t nextPath = 0;
+  for (size_t row = 0; row < rowCount; ++row) {
+    const int basePos = getSeqStartPos(lastBeam, row);
+    const real* rowIds = candData + row * beamSize_;
+    for (size_t col = 0; col < beamSize_; ++col) {
+      const int id = rowIds[col];
+      if (id != -1) {
+        pathRowIdsInEachBeam_[lastBeam][nextPath] = id + basePos;
+        parentIdsInBeam_[nextPath] = row;
+        ++nextPath;
+      }
+    }
+  }
+  return pathCount;
+}
+/*
+ * Build, for every valid expansion, the row index into that expansion's
+ * scores for each path found by initLastExpansion(). Expansions are walked
+ * backwards from the one before the last valid expansion; parentIdsInBeam_
+ * is rewritten in place at every step to hold each path's parent row in the
+ * expansion being processed.
+ */
+void CostForOneSequence::constructTotalExpansion() {
+  /*
+   * construct the entire expanded beam by beginning with the last search
+   * in which gold falls off the beam.
+   */
+  size_t totalPathCount = initLastExpansion();
+  /*
+   * When the gold sequence is appended as an extra path it occupies the last
+   * slot and is filled in from goldRowIds_ instead of being traced back.
+   */
+  const size_t tracedPathCount =
+      goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount;
+  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
+    const MatrixPtr candidates = beams_->candidateIds[beamId];
+    real* ids = candidates->getData();
+    int lastParentIdInBeam = -1;
+    int basePos = -1;
+    for (size_t i = 0; i < tracedPathCount; ++i) {
+      int id = ids[parentIdsInBeam_[i]];
+      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
+      /* only look the start position up again when the parent changes. */
+      if (parentIdsInBeam_[i] != lastParentIdInBeam)
+        basePos = getSeqStartPos(beamId, parentRowId);
+      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
+      lastParentIdInBeam = parentIdsInBeam_[i];
+      parentIdsInBeam_[i] = parentRowId;
+    }
+    /*
+     * The gold entry does not depend on the traced paths, so it is written
+     * once per expansion, and is still written when there is no traced path.
+     */
+    if (goldAsExtraPath_)
+      pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
+          beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
+  }
+}
+/*
+ * Sum, for every path, the scores selected from each valid expansion, apply
+ * softmax over all paths, and return the negative log of the softmax value
+ * at the gold path.
+ */
+real CostForOneSequence::globallyNormalizedScore() {
+  expandedPathScores_.resize(validExpansionCount_);
+  const size_t pathCount = pathRowIdsInEachBeam_[0].size();
+  Matrix::resizeOrCreate(softmaxOut_, 1, pathCount, false, false);
+  softmaxOut_->zero();
+  /* a (pathCount x 1) matrix created over softmaxOut_'s data pointer. */
+  MatrixPtr pathScoreSum = Matrix::create(
+      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
+  for (size_t step = 0; step < validExpansionCount_; ++step) {
+    std::vector<int>& rows = pathRowIdsInEachBeam_[step];
+    MatrixPtr& stepScores = expandedPathScores_[step];
+    Matrix::resizeOrCreate(stepScores, rows.size(), 1, false, false);
+    IVectorPtr rowIds = IVector::create(rows.data(), rows.size(), false);
+    stepScores->selectRows(*(beams_->scores[step]), *rowIds);
+    pathScoreSum->add(*stepScores);
+  }
+  softmaxOut_->softmax(*softmaxOut_);
+  real goldProb = softmaxOut_->getData()[goldIdsInFinalExpansion_];
+  return -std::log(goldProb);
+}
+/*
+ * Compute the cost of one sequence: find the valid expansions, construct all
+ * expanded paths, then return the globally normalized score of the gold path.
+ */
+real CostForOneSequence::forward() {
+  calValidExpandStep();
+  constructTotalExpansion();
+  const real cost = globallyNormalizedScore();
+  return cost;
+}
+/*
+ * Subtract 1 from the softmax output at the gold path, then add the
+ * resulting per-path values to the rows of every valid expansion's score
+ * gradient selected by pathRowIdsInEachBeam_.
+ */
+void CostForOneSequence::backward() {
+  real* pathGrad = softmaxOut_->getData();
+  pathGrad[goldIdsInFinalExpansion_] -= 1.;
+  MatrixPtr pathGradColumn =
+      Matrix::create(pathGrad, softmaxOut_->getWidth(), 1, false, false);
+  for (size_t step = 0; step < validExpansionCount_; ++step) {
+    std::vector<int>& rows = pathRowIdsInEachBeam_[step];
+    IVectorPtr rowIds = IVector::create(rows.data(), rows.size(), false);
+    /*
+      beams_->scoreGrad[step] has been initialized outside this class. This
+      class only keeps a pointer to the original input gradients, so there
+      is no need to allocate or initialize the memory here.
+    */
+    pathGradColumn->addToRows(*beams_->scoreGrad[step], *rowIds);
+  }
+}
 REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
 bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
@@ -24,13 +186,189 @@ bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
  Layer::init(layerMap, parameterMap);
  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";
-  setNeedSequenceInfo(false);
+  beamExpanCount_ = inputLayers_.size() / 3;
+  candidateScores_.resize(beamExpanCount_);
+  candidateScoreGrad_.resize(beamExpanCount_);
+  candidateInBeam_.resize(beamExpanCount_);
+  goldSequence_.resize(beamExpanCount_);
+  gradToInputs_.resize(beamExpanCount_);
+  setNeedSequenceInfo(false);
  return true;
 }
-void CrossEntropyOverBeam::forward(PassType passType) {}
+/*
+ * Validate the inputs, which come in groups of three per beam expansion:
+ * candidate scores, selected candidates, and the gold sequence.
+ *
+ * batchSize_ and beamSize_ are taken from the first expansion; every later
+ * expansion is checked against them.
+ */
+void CrossEntropyOverBeam::checkInputs() {
+  batchSize_ = 0;
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    const Argument& scores = getInput(i * 3);
+    const Argument& selCandidates = getInput(i * 3 + 1);
+    const Argument& goldSeq = getInput(i * 3 + 2);
+    if (i) {
+      CHECK(scores.hasSubseq()) << "Beam expansions, except the first one, "
+                                   "should be nested sequences";
+      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
+      CHECK_EQ(scores.getNumSequences(), batchSize_);
+      /* one row of selected candidates per sub-sequence of scores. */
+      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
+    } else {
+      CHECK(scores.hasSeq()) << "The first beam expansion should be a sequence";
+      batchSize_ = scores.getNumSequences();
+      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
+      CHECK_EQ(batchSize_, selCandidates.getBatchSize());
+    }
+    /* scores must be a single column. */
+    CHECK_EQ(1U, scores.value->getWidth());
+    CHECK_EQ(batchSize_, goldSeq.getBatchSize());
+  }
+}
+/*
+ * Make the inputs of every beam expansion available as CPU data and compute
+ * beamSplitPos_.
+ *
+ * Inputs that are GPU matrices/vectors are copied into CPU buffers owned by
+ * this layer; all other inputs are shared without copying.
+ *
+ * beamSplitPos_[s][i] is the cumulative number of rows of expansion i that
+ * belong to sequences 0..s: s + 1 for the first expansion, and the running
+ * count of sub-sequences for the later ones.
+ */
+void CrossEntropyOverBeam::copyInputsToCpu() {
+  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
+    if (dynamic_cast<GpuMatrix*>(src.get())) {
+      Matrix::resizeOrCreate(
+          trg, src->getHeight(), src->getWidth(), false, false);
+      trg->copyFrom(*src);
+    } else {
+      /* src is a const reference and cannot be moved from: share it. */
+      trg = src;
+    }
+  };
+  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
+    if (dynamic_cast<GpuIVector*>(src.get())) {
+      IVector::resizeOrCreate(trg, src->getSize(), false);
+      trg->copyFrom(*src);
+    } else {
+      /* src is a const reference and cannot be moved from: share it. */
+      trg = src;
+    }
+  };
+  beamSplitPos_.clear();
+  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    copyValue(getInputValue(i * 3), candidateScores_[i]);
+    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
+    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);
+    if (i) {
+      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
+      const int* seqStarts = seqInfo->getMutableData(false);
+      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
+      const int* subSeqStarts = subSeqInfo->getMutableData(false);
+      size_t seqId = 1;
+      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
+           ++subSeqId) {
+        CHECK_LT(seqId, seqInfo->getSize());
+        /*
+         * This sub-sequence starts a new sequence: carry the running count
+         * over to the new sequence before counting the sub-sequence.
+         */
+        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
+          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
+          seqId++;
+        }
+        beamSplitPos_[seqId - 1][i]++;
+      }
+    } else {
+      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
+    }
+  }
+}
+/*
+ * Split the batch-level CPU buffers into per-sequence views, filling
+ * beamPerSeq_[j] for every sequence j and every expansion i. The matrices
+ * and index vectors are created over pointers into the batch-level buffers
+ * at the per-sequence offsets.
+ */
+void CrossEntropyOverBeam::splitBatchBeams() {
+  beamCosts_.resize(batchSize_);
+  // the second argument is converted to a BeamExpansion through its
+  // BeamExpansion(int) constructor and used as the fill value.
+  beamPerSeq_.resize(batchSize_, beamExpanCount_);
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    int* seqStarts =
+        getInput(i * 3).sequenceStartPositions->getMutableData(false);
+    int* subSeqStarts = nullptr;
+    // number of rows available in this expansion's start-position vector.
+    int maxLen = 0;
+    if (i) {
+      subSeqStarts =
+          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
+      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
+    } else
+      // i is 0 in this branch, so getInput(i) is the first expansion's scores.
+      maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
+    for (size_t j = 0; j < batchSize_; ++j) {
+      // scores and their gradients of sequence j in expansion i.
+      beamPerSeq_[j].scores[i] =
+          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+      beamPerSeq_[j].scoreGrad[i] =
+          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+      // beamSplitPos_ holds cumulative row counts, so the rows of sequence j
+      // are [offset, offset + height).
+      int offset = j ? beamSplitPos_[j - 1][i] : 0;
+      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
+      CHECK_GE(maxLen, offset + height);
+      // height + 1 start positions delimit the height rows.
+      beamPerSeq_[j].seqInfo[i] = IVector::create(
+          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);
-void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {}
+      beamPerSeq_[j].candidateIds[i] =
+          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
+                         height,
+                         beamSize_,
+                         false,
+                         false);
+      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];
+    }
+  }
+}
+/*
+ * Size and zero the (batchSize_ x 1) output, and prepare a zeroed CPU
+ * gradient buffer for the scores of every beam expansion. When the input
+ * gradient is a GPU matrix a separate CPU buffer of the same shape is used;
+ * otherwise the input gradient itself is used (and zeroed).
+ */
+void CrossEntropyOverBeam::resizeOutput() {
+  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
+  output_.value->zero();
+  for (size_t step = 0; step < beamExpanCount_; ++step) {
+    MatrixPtr inGrad = getInputGrad(step * 3);
+    MatrixPtr& cpuGrad = candidateScoreGrad_[step];
+    const bool gradOnGpu = dynamic_cast<GpuMatrix*>(inGrad.get()) != nullptr;
+    if (gradOnGpu) {
+      Matrix::resizeOrCreate(
+          cpuGrad, inGrad->getHeight(), inGrad->getWidth(), false, false);
+    } else {
+      cpuGrad = std::move(inGrad);
+    }
+    cpuGrad->zero();
+  }
+}
+/*
+ * Copy the CPU score gradients of the first copyCount beam expansions back
+ * into the input gradients that are GPU matrices.
+ *
+ * copyCount is clamped to beamExpanCount_; a copyCount of 0 copies nothing.
+ */
+void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
+  const size_t count =
+      copyCount < beamExpanCount_ ? copyCount : beamExpanCount_;
+  for (size_t i = 0; i < count; ++i) {
+    MatrixPtr inGrad = getInputGrad(i * 3);
+    if (dynamic_cast<GpuMatrix*>(inGrad.get()))
+      inGrad->copyFrom(*candidateScoreGrad_[i]);
+  }
+}
+/*
+ * Forward pass: validate the inputs, bring them to CPU, prepare the output
+ * and gradient buffers, split the batch into per-sequence beams, and write
+ * the cost of every sequence into the output.
+ */
+void CrossEntropyOverBeam::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+  copyInputsToCpu();
+  resizeOutput();
+  splitBatchBeams();
+  MatrixPtr outputValue = getOutputValue();
+  for (size_t i = 0; i < batchSize_; ++i) {
+    /* make_shared already yields a temporary; no std::move is needed. */
+    beamCosts_[i].setData(std::make_shared<BeamExpansion>(beamPerSeq_[i]),
+                          beamSize_);
+    outputValue->getData()[i] = beamCosts_[i].forward();
+  }
+}
+/*
+ * Backward pass: accumulate the score gradients of every sequence into the
+ * CPU gradient buffers, then copy them back to GPU input gradients.
+ *
+ * The copy is done once after all sequences have been processed, covering
+ * the largest number of valid expansions seen, instead of re-copying the
+ * whole gradient matrices after every sequence. The callback is not used.
+ */
+void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
+  size_t maxValidCount = 0;
+  for (size_t i = 0; i < batchSize_; ++i) {
+    beamCosts_[i].backward();
+    const size_t validCount = beamCosts_[i].getValidExpansionCount();
+    if (validCount > maxValidCount) maxValidCount = validCount;
+  }
+  /* nothing was accumulated when no expansion was valid. */
+  if (maxValidCount) copyGradToGpu(maxValidCount);
+}
 }  // namespace paddle
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -19,6 +19,79 @@ limitations under the License. */
 namespace paddle {
+// Stores the entire beam expansion for a single sequence. Every vector has
+// one entry per expansion step.
+struct BeamExpansion {
+  // candidate scores of each expansion.
+  std::vector<MatrixPtr> scores;
+  // start positions used to locate the rows of each expansion.
+  std::vector<IVectorPtr> seqInfo;
+  // selected candidate ids of each expansion.
+  std::vector<MatrixPtr> candidateIds;
+  // gold label of each expansion.
+  std::vector<int> gold;
+  // gradients with respect to the scores of each expansion.
+  std::vector<MatrixPtr> scoreGrad;
+  // number of expansion steps.
+  size_t expansionCount;
+  // Sizes every per-expansion vector to n entries.
+  BeamExpansion(int n)
+      : scores(static_cast<size_t>(n)),
+        seqInfo(static_cast<size_t>(n)),
+        candidateIds(static_cast<size_t>(n)),
+        gold(static_cast<size_t>(n)),
+        scoreGrad(static_cast<size_t>(n)),
+        expansionCount(n) {}
+};
+typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
+// Computes the cost of a single sequence over its beam expansions and
+// propagates the error back to the expansions' score gradients.
+class CostForOneSequence {
+public:
+  CostForOneSequence()
+      : beamSize_(0),
+        validExpansionCount_(0),
+        goldAsExtraPath_(false),
+        goldIdsInFinalExpansion_(0) {}
+  // Attach the beam expansions of one sequence and reset the per-expansion
+  // gold positions (rows to 0, columns to -1, i.e. "not found").
+  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
+    beams_ = bPtr;
+    beamSize_ = beamSize;
+    expandedPathScores_.clear();
+    expandedPathScores_.resize(beams_->expansionCount);
+    goldRowIds_.clear();
+    goldRowIds_.resize(beams_->expansionCount, 0);
+    goldColIds_.clear();
+    goldColIds_.resize(beams_->expansionCount, -1);
+  }
+  // Number of expansions examined by the last call to forward().
+  size_t getValidExpansionCount() const { return validExpansionCount_; }
+  real forward();
+  void backward();
+private:
+  void calValidExpandStep();
+  void constructTotalExpansion();
+  size_t initLastExpansion();
+  real globallyNormalizedScore();
+  // Start position of row rowId in expansion beamId, relative to the first
+  // start position of this sequence.
+  int getSeqStartPos(size_t beamId, size_t rowId) {
+    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
+    int* starts = beams_->seqInfo[beamId]->getData();
+    return starts[rowId] - starts[0];
+  }
+  size_t beamSize_;
+  size_t validExpansionCount_;
+  // true when the gold sequence is appended as an extra, last path.
+  bool goldAsExtraPath_;
+  // row and column of the gold label in each expansion (column -1: not found).
+  std::vector<int> goldRowIds_;
+  std::vector<int> goldColIds_;
+  BeamExpansionPtr beams_;
+  // for each valid expansion, the score-row index of every path.
+  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
+  // parent row of every path in the expansion currently being processed.
+  std::vector<int> parentIdsInBeam_;
+  // index of the gold path among all paths.
+  size_t goldIdsInFinalExpansion_;
+  std::vector<MatrixPtr> expandedPathScores_;
+  MatrixPtr softmaxOut_;
+};
 class CrossEntropyOverBeam : public Layer {
 public:
  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
@@ -26,6 +99,31 @@ public:
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
+private:
+  void checkInputs();
+  void copyInputsToCpu();
+  void resizeOutput();
+  void copyGradToGpu(size_t copyCount);
+  void splitBatchBeams();
+  size_t beamExpanCount_;
+  size_t batchSize_;
+  size_t beamSize_;
+  // Currently, this layer only works on CPU. If its inputs are on GPU,
+  // they are copied to CPU memory.
+  std::vector<MatrixPtr> candidateScores_;
+  std::vector<MatrixPtr> candidateScoreGrad_;
+  std::vector<MatrixPtr> candidateInBeam_;
+  std::vector<MatrixPtr> gradToInputs_;
+  std::vector<IVectorPtr> goldSequence_;
+  std::vector<std::vector<int>> beamSplitPos_;
+  // split the entire batch of beams into one beam per sequence.
+  std::vector<BeamExpansion> beamPerSeq_;
+  // beamCosts_ is used to propagate error in one sequence.
+  std::vector<CostForOneSequence> beamCosts_;
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -28,9 +28,17 @@ using namespace paddle;  // NOLINT
 DECLARE_int32(gpu_id);
 DECLARE_bool(thread_local_rand_use_global_seed);
-const size_t MAX_SEQ_NUM = 10;
+// const size_t MAX_SEQ_NUM = 5;
-const size_t MAX_SEQ_LEN = 27;
+// const size_t MAX_SEQ_LEN = 10;
-const size_t MAX_BEAM_SIZE = 10;
+// const size_t MAX_BEAM_SIZE = 3;
+const size_t MAX_SEQ_NUM = 23;
+const size_t MAX_SEQ_LEN = 50;
+const size_t MAX_BEAM_SIZE = 27;
+// const size_t SEED = 1503391792;
+// const size_t SEED = 1;
+const size_t SEED = (size_t)(time(NULL));
 struct SingleBeamExpansion {
  vector<int> seqStartPos;
@@ -43,11 +51,30 @@ struct SingleBeamExpansion {
  vector<int> groundTruth;
  vector<size_t> inBeam;
  vector<int> rowIdxInBeam;
+  vector<int> colIdxInBeam;
+  // Reset all ground-truth bookkeeping to n entries: labels and row/column
+  // indices become -1, and the in-beam flags become 0.
+  void resetGroundTruth(size_t n) {
+    groundTruth.assign(n, -1);
+    inBeam.assign(n, 0);
+    rowIdxInBeam.assign(n, -1);
+    colIdxInBeam.assign(n, -1);
+  }
 };
+// Returns rand() scaled into [0, 1] as a float.
+inline float randFloat() {
+  const float numerator = static_cast<float>(rand());
+  const float denominator = static_cast<float>(RAND_MAX);
+  return numerator / denominator;
+}
 void genRand(real* numbers, size_t n) {
  default_random_engine generator;
-  uniform_real_distribution<double> distribution(0.0, 1.0);
+  uniform_real_distribution<real> distribution(0.0, 1.0);
  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
 }
@@ -72,8 +99,7 @@ void genCandidateScores(bool hasSubseq,
  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
  subSeqStartPos.resize(1, 0);
-  srand((size_t)(time(NULL)));
+  srand(SEED);
-  // srand(1);
  if (prevBeam.selectedIndices.size()) {
    if (prevBeam.subSeqStartPos.size() > 1) {
      int seqIdx = 1;
@@ -81,9 +107,8 @@ void genCandidateScores(bool hasSubseq,
      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
        for (size_t j = 0; j < beamSize; ++j) {
          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
-          for (size_t k = 0; k < beamSize; ++k)
+          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
-            subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
+                                   subSeqStartPos.back());
-                                     subSeqStartPos.back());
        }
        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
          seqStartPos.push_back(subSeqStartPos.back());
@@ -91,7 +116,6 @@ void genCandidateScores(bool hasSubseq,
        }
      }
    } else {
-      // samples in previous beam are sequences.
      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
        if (i && i % beamSize == 0) {
          seqStartPos.push_back(subSeqStartPos.back());
@@ -141,27 +165,41 @@ void genSelectedIndices(size_t beamSize,
 void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
                    size_t beamSize) {
-  size_t seqNum = beamExpansions[1].seqStartPos.size() - 1;
+  SingleBeamExpansion& beam = beamExpansions[1];
+  size_t seqNum = beam.seqStartPos.size() - 1;
  for (size_t i = 2; i < beamExpansions.size(); ++i)
-    CHECK_EQ(seqNum, beamExpansions[i - 1].seqStartPos.size() - 1);
+    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
-  // srand(1);
+  srand(SEED);
-  srand((size_t)(time(NULL)));
  // initialize the first beam.
-  SingleBeamExpansion& beam = beamExpansions[1];
+  beam.resetGroundTruth(seqNum);
-  beam.groundTruth.resize(seqNum, 0);
-  beam.inBeam.resize(seqNum, 0);
-  beam.rowIdxInBeam.resize(seqNum, -1);
-  auto begPos = beam.selectedIndices.begin();
  for (size_t i = 0; i < seqNum; ++i) {
-    int seqLen = beam.seqStartPos[i + 1] - beam.seqStartPos[i];
+    if (randFloat() > 0.5) {
-    int label = rand() % seqLen;
+      // force the randomly generated label falls in the beam by chance 0.5.
-    auto endPos = begPos + beamSize;
+      // otherwise, when sequence length is relatively long and beam size is
-    beam.groundTruth[i] = label;
+      // relatively small, the gold sequences falls off the beam at in
-    if (find(begPos, endPos, real(label)) != endPos) beam.inBeam[i] = 1;
+      // the first search.
-    begPos = endPos;
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      beam.colIdxInBeam[i] =
+          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
+            return val != -1.;
+          });
+      beam.groundTruth[i] =
+          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
+      beam.inBeam[i] = 1;
+    } else {
+      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
+      beam.groundTruth[i] = label;
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      real* endPos = begPos + beamSize;
+      real* lblPos = find(begPos, endPos, real(label));
+      if (lblPos != endPos) {
+        beam.inBeam[i] = 1;
+        beam.colIdxInBeam[i] = lblPos - begPos;
+      }
+    }
    beam.rowIdxInBeam[i] = i;
  }
@@ -169,22 +207,33 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
  for (size_t i = 2; i < beamExpansions.size(); ++i) {
    SingleBeamExpansion& curBeam = beamExpansions[i];
    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
+    curBeam.resetGroundTruth(seqNum);
-    curBeam.groundTruth.resize(seqNum, 0);
-    curBeam.inBeam.resize(seqNum, 0);
-    curBeam.rowIdxInBeam.resize(seqNum, -1);
    // iterate over each sequence
    for (size_t j = 0; j < seqNum; ++j) {
-      if (prevBeam.inBeam[j]) {
+      if (!prevBeam.inBeam[j]) continue;
-        // gold sequence falls in the beam in previous search.
+      // gold sequence falls in the beam in previous search.
-        auto begPos = prevBeam.selectedIndices.begin();
+      real* begPos = prevBeam.selectedIndices.data();
-        auto endPos = begPos + prevBeam.rowIdxInBeam[j] * beamSize;
+      int offset =
-        size_t totalExpansion =
+          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
-            prevBeam.rowIdxInBeam[j] * beamSize - count(begPos, endPos, -1.);
+      curBeam.rowIdxInBeam[j] = count_if(
-        curBeam.rowIdxInBeam[j] = totalExpansion + prevBeam.groundTruth[j];
+          begPos, begPos + offset, [](const real& val) { return val != -1.; });
+      if (randFloat() > 0.5) {
+        // Force the randomly generated label to fall in the beam with
+        // probability 0.5. Otherwise, when the sequence is relatively long
+        // and the beam size is relatively small, the gold sequence falls off
+        // the beam in the first search.
+        real* start =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
+                  return val != -1.;
+                });
+        curBeam.colIdxInBeam[j] = n;
+        curBeam.groundTruth[j] = *(start + n);
+        curBeam.inBeam[j] = 1;
+      } else {
        CHECK_LE(curBeam.rowIdxInBeam[j] + 1,
                 curBeam.subSeqStartPos.size() - 1);
        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
@@ -193,16 +242,14 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
        int label = rand() % (end - start);
        curBeam.groundTruth[j] = label;
-        auto findBeg = curBeam.selectedIndices.begin() +
+        real* findBeg =
-                       curBeam.rowIdxInBeam[j] * beamSize;
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        auto findEnd = findBeg + beamSize;
+        real* lblPos =
-        if (find(findBeg, findEnd, real(label)) != findEnd)
+            find(findBeg, findBeg + beamSize, static_cast<real>(label));
+        if (lblPos != (findBeg + beamSize)) {
          curBeam.inBeam[j] = 1;
-      } else {
+          curBeam.colIdxInBeam[j] = lblPos - findBeg;
-        // in previous search, gold sequence has fallen off the beam,
+        }
-        // the beam search stops, here use -1 as a dummy label.
-        // It will not used in calculation the cost.
-        beamExpansions[i].groundTruth[j] = -1;
      }
    }
  }
@@ -230,15 +277,12 @@ void genRandomBeamExpansion(size_t expansionCount,
  genGroundTruth(beamExpansions, beamSize);
 }
-void testCrossEntropyOverBeam(bool useGpu) {
+void testCrossEntropyOverBeam(bool useGpu,
+                              size_t beamSize,
+                              vector<SingleBeamExpansion>& beams) {
  TestConfig config;
  config.layerConfig.set_type("cross_entropy_over_beam");
-  const size_t expansionCount = 3;
-  const size_t beamSize = MAX_BEAM_SIZE;
-  vector<SingleBeamExpansion> beams;
-  genRandomBeamExpansion(expansionCount, beamSize, beams);
  size_t seqNum = 0;
  for (size_t i = 1; i < beams.size(); ++i) {
    const SingleBeamExpansion& beam = beams[i];
@@ -291,7 +335,17 @@ void testCrossEntropyOverBeam(bool useGpu) {
 }
 TEST(Layer, CrossEntropyOverBeam) {
-  for (bool useGpu : {false, true}) testCrossEntropyOverBeam(useGpu);
+  LOG(INFO) << "SEED = " << SEED;
+  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
+  LOG(INFO) << "beamSize = " << beamSize;
+  // TODO(caoying): test with more beam expansions.
+  const size_t expansionCount = 3;
+  vector<SingleBeamExpansion> beams;
+  genRandomBeamExpansion(expansionCount, beamSize, beams);
+  for (bool useGpu : {false, true})
+    testCrossEntropyOverBeam(useGpu, beamSize, beams);
 }
 int main(int argc, char** argv) {
@@ -299,7 +353,7 @@ int main(int argc, char** argv) {
  hl_start();
  hl_init(FLAGS_gpu_id);
  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
+  srand(SEED);
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }