Commit 0f91ea7e authored by Haonan, committed by Yu Yang

use HPPL_STREAM_DEFAULT for layer computation

Change-Id: Id66da7b7f5bf9ec80cc19b347e4fb822a5a6f197
Parent 688eeefa
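Every hunk below applies the same idiom: queue asynchronous host/device copies on HPPL_STREAM_DEFAULT (instead of HPPL_STREAM_1), then synchronize that one stream before the copied data is read. The following is a minimal sketch of the idiom, reusing the Argument and HPPL names visible in this diff; the helper function itself and the include paths are assumptions, not part of the commit:

    #include <vector>
    #include "hl_gpu.h"                     // hl_stream_synchronize, HPPL_STREAM_DEFAULT (assumed path)
    #include "paddle/parameter/Argument.h"  // Argument (assumed path)

    // Hypothetical helper showing the copy-then-synchronize pattern this
    // commit applies in CTCLayer, HuberTwoClass, and TrainerThread.
    static void copyInputsToCpu(std::vector<paddle::Argument>& tmpCpuInput,
                                const std::vector<paddle::Argument>& inputs) {
      for (size_t i = 0; i < inputs.size(); ++i) {
        // Each copy is queued on the default stream; nothing blocks here.
        tmpCpuInput[i].resizeAndCopyFrom(inputs[i], /* useGpu = */ false,
                                         HPPL_STREAM_DEFAULT);
      }
      // Block once, after all copies are queued, before the CPU reads the data.
      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    }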
......@@ -194,8 +194,9 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
......@@ -878,7 +878,11 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu());
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
   }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
   gradientMachine_->setOutputGrad(outArgs_);
 }
......
......@@ -49,9 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
......@@ -93,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
......
......@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
......
......@@ -509,9 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
......
......@@ -52,9 +52,10 @@ public:
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
       }
-      hl_stream_synchronize(HPPL_STREAM_1);
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
......
......@@ -22,8 +22,12 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    Matrix::resizeOrCreate(dest, src->getHeight(),
-                           src->getWidth(), false, useGpu);
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
......@@ -59,7 +63,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
   CHECK_LE((size_t)startRow + copySize, src->getHeight());
   int height = copySize;
   int width = src->getWidth();
-  Matrix::resizeOrCreate(dest, height, width, false, useGpu);
+  if (!dest) {
+    dest = src->clone(height, width, useGpu);
+  } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
+    dest->resize(height, width);
+  }
   MatrixPtr submat = src->subMatrix(startRow, copySize);
   if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
     // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
......
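Both resizeAndCopy hunks above swap Matrix::resizeOrCreate for a clone-or-resize branch. A plausible reading, which the commit message does not state: resizeOrCreate always yields a plain dense matrix, whereas src->clone(height, width, useGpu) allocates a destination of src's concrete type, and the CHECK_EQ refuses to reuse a destination that lives on the wrong device. Sketched below as a standalone helper; ensureLike is a hypothetical name, not part of the commit:

    // Hypothetical helper mirroring the branch introduced in both hunks above.
    static void ensureLike(MatrixPtr& dest, const MatrixPtr& src,
                           size_t height, size_t width, bool useGpu) {
      if (!dest) {
        // clone() allocates a matrix of src's concrete type (dense/sparse)
        // on the requested device.
        dest = src->clone(height, width, useGpu);
      } else {
        // Reuse dest only if it is already on the right device; never
        // silently migrate it.
        CHECK_EQ(dest->useGpu(), useGpu);
        dest->resize(height, width);
      }
    }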
......@@ -203,15 +203,27 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note that when the stream is specified explicitly here, the caller
+   * must also synchronize the stream at some point after this call
    */
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
+  /*
+   * same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu = FLAGS_use_gpu);
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+  /*
+   * same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
   void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
   /*
......
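Taken together, the declarations above put the synchronization contract into the API surface: pass a stream explicitly and the caller owns the synchronize; omit it and HPPL_STREAM_DEFAULT is used with the synchronize done inside. An illustrative call site follows, where srcArg and dstArg are hypothetical Argument objects:

    // Explicit-stream overload: the copy is asynchronous, so the caller must
    // synchronize before reading dstArg.
    dstArg.resizeAndCopyFrom(srcArg, /* useGpu = */ false, HPPL_STREAM_DEFAULT);
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);

    // Convenience overload: uses HPPL_STREAM_DEFAULT and synchronizes
    // internally, so dstArg is ready as soon as the call returns.
    dstArg.resizeAndCopyFrom(srcArg, /* useGpu = */ false);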