Merge pull request #2618 from hedaoyuan/fix_copyFrom

Change the CpuMatrix::copyFrom and CpuVector::copyFrom with the strea…

Merge pull request #2618 from hedaoyuan/fix_copyFrom
Change the CpuMatrix::copyFrom and CpuVector::copyFrom with the strea…
4db23bba · hedaoyuan · GitHub · 7978f05d · 49e87ee3 · 4db23bba
6 changed files
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) {
 void Layer::copyOutputToOtherDevice() {
  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
    SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // the copyFrom is a synchronous interface.
+    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
+    // calculations are all on HPPL_STREAM_DEFAULT,
+    // copyFrom can be an asynchronous interface.
    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
                                          HPPL_STREAM_DEFAULT);
    outputOtherDevice_[i].sequenceStartPositions =

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
                    const_cast<real*>(src.getData()),
                    sizeof(real) * elementCnt_,
                    stream);
+    // Synchronize the stream so the copy has finished before returning.
+    hl_stream_synchronize(stream);
  } else if (typeid(src) == typeid(CpuMatrix)) {
    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
  } else {

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -239,7 +239,8 @@ public:
    LOG(FATAL) << "Not implemented";
  }
-  // asynchronous copy
+  // For GpuMatrix this is an asynchronous copy interface.
+  // For CpuMatrix this is a synchronous copy interface.
  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
    LOG(FATAL) << "Not implemented";
  }

--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -657,6 +657,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
                    (void*)src.getData(),
                    sizeof(T) * this->getSize(),
                    stream);
+    // Synchronize the stream so the copy has finished before returning.
+    hl_stream_synchronize(stream);
  } else {
    src.copyTo(this);
  }

--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -168,11 +168,11 @@ public:
  virtual void copyFrom(const VectorT<T>& src) = 0;
  /**
-   * If use_gpu, this function will push the copy-task to the specifed-stream
+   * If GpuVector, this function is an asynchronous interface,
-   * and return immediately.
+   * will push the copy-task to the specified stream and return immediately.
   *
-   * If not use GPU, this function is same as
+   * If CpuVector, this function is a synchronous interface,
-   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
+   * same as the copyFrom(const VectorT<T>& src).
   */
  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) {
  }
 }
+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 1000;
+  const size_t width = 1000;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+  TensorCheckEqual(cpu, copy);
+}
 #endif