提交 688eeefa 编写于 作者: H Haonan 提交者: Yu Yang

fixed issues with synchronizing streams when copying from GPU to CPU

* by default, synchronize default_stream after resizeAndCopyFrom
* add sync in some places after resizeAndCopyFrom using other streams
上级 42a11791
...@@ -196,7 +196,6 @@ public: ...@@ -196,7 +196,6 @@ public:
Argument output, label; Argument output, label;
output.resizeAndCopyFrom(arguments[0], false); output.resizeAndCopyFrom(arguments[0], false);
label.resizeAndCopyFrom(arguments[1], false); label.resizeAndCopyFrom(arguments[1], false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
CHECK(label.sequenceStartPositions); CHECK(label.sequenceStartPositions);
CHECK(label.ids); CHECK(label.ids);
size_t numSequences = label.sequenceStartPositions->getSize() - 1; size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
...@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() { ...@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
outArgs_.resize(outputGradArgs.size()); outArgs_.resize(outputGradArgs.size());
for (size_t i = 0; i < outputGradArgs.size(); i++) { for (size_t i = 0; i < outputGradArgs.size(); i++) {
outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize, outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
multiMachine_->useGpu(), multiMachine_->useGpu());
HPPL_STREAM_DEFAULT);
}
if (multiMachine_->useGpu()) {
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
} }
gradientMachine_->setOutputGrad(outArgs_); gradientMachine_->setOutputGrad(outArgs_);
} }
......
...@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) { ...@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]); forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
} else { } else {
forwardImp(getInput(0), getInput(1)); forwardImp(getInput(0), getInput(1));
......
...@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, ...@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
} }
forwardImpIn(output, label, cost); forwardImpIn(output, label, cost);
} }
......
...@@ -54,6 +54,7 @@ public: ...@@ -54,6 +54,7 @@ public:
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0]); forwardImp(tmpCpuInput_[0]);
} else { } else {
forwardImp(getInput(0)); forwardImp(getInput(0));
......
...@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers, ...@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST); testLayer->forward(PASS_TEST);
Argument out; Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) { if (batchOut.value) {
size_t dim = batchOut.value->getWidth(); size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value); ASSERT_TRUE((bool)out.value);
...@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers, ...@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST); testLayer->forward(PASS_TEST);
Argument out; Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) { if (batchOut.value) {
size_t dim = batchOut.value->getWidth(); size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value); ASSERT_TRUE((bool)out.value);
......
...@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, ...@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
Argument& cpuInput = testCpu.dataLayer_->getOutput(); Argument& cpuInput = testCpu.dataLayer_->getOutput();
Argument& gpuInput = testGpu.dataLayer_->getOutput(); Argument& gpuInput = testGpu.dataLayer_->getOutput();
gpuInput.resizeAndCopyFrom(cpuInput, true); gpuInput.resizeAndCopyFrom(cpuInput, true);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
......
...@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, ...@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
if (!matrix) { if (!matrix) {
matrix = Matrix::create(height, width, trans, useGpu); matrix = Matrix::create(height, width, trans, useGpu);
} else { } else {
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width); matrix->resize(height, width);
} }
} }
...@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, ...@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
} else { } else {
CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) || CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
dynamic_cast<GpuSparseMatrix*>(matrix.get())); dynamic_cast<GpuSparseMatrix*>(matrix.get()));
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width, nnz, valueType, format); matrix->resize(height, width, nnz, valueType, format);
} }
} }
......
...@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) { ...@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
} else if ((!useGpu) && (!cpuVectorT_)) { } else if ((!useGpu) && (!cpuVectorT_)) {
cpuVectorT_ = VectorT<T>::create(size, false); cpuVectorT_ = VectorT<T>::create(size, false);
} else { } else {
CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
this->resize(size, useGpu); this->resize(size, useGpu);
} }
} }
......
...@@ -22,11 +22,8 @@ namespace paddle { ...@@ -22,11 +22,8 @@ namespace paddle {
static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
if (src) { if (src) {
if (!dest) { Matrix::resizeOrCreate(dest, src->getHeight(),
dest = src->clone(0, 0, useGpu); src->getWidth(), false, useGpu);
} else {
dest->resize(src->getHeight(), src->getWidth());
}
dest->copyFrom(*src, stream); dest->copyFrom(*src, stream);
} else { } else {
dest.reset(); dest.reset();
...@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, ...@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
hl_stream_t stream = HPPL_STREAM_DEFAULT) { hl_stream_t stream = HPPL_STREAM_DEFAULT) {
if (src) { if (src) {
CHECK_LE((size_t)startRow + copySize, src->getHeight()); CHECK_LE((size_t)startRow + copySize, src->getHeight());
int height = copySize; int height = copySize;
int width = src->getWidth(); int width = src->getWidth();
if (!dest) { Matrix::resizeOrCreate(dest, height, width, false, useGpu);
dest = src->clone(height, width, useGpu);
} else {
dest->resize(height, width);
}
MatrixPtr submat = src->subMatrix(startRow, copySize); MatrixPtr submat = src->subMatrix(startRow, copySize);
if (dynamic_cast<GpuSparseMatrix*>(dest.get())) { if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
// copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix. // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
...@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, ...@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
} }
} }
void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
}
void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
dataId = src.dataId; dataId = src.dataId;
...@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, ...@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
resizeAndCopy(strs, src.strs, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream);
} }
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu) {
int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
return size;
}
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu, int32_t copySize, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
......
...@@ -205,11 +205,14 @@ struct Argument { ...@@ -205,11 +205,14 @@ struct Argument {
* return value: how many samples are copied * return value: how many samples are copied
*/ */
int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu = FLAGS_use_gpu, int32_t copySize, bool useGpu, hl_stream_t stream);
hl_stream_t stream = HPPL_STREAM_DEFAULT);
void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu, int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
hl_stream_t stream = HPPL_STREAM_DEFAULT); int32_t copySize, bool useGpu = FLAGS_use_gpu);
void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
/* /*
@brief Concatenate several arguments into one and put the result into it. @brief Concatenate several arguments into one and put the result into it.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册