Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into scope-impl

fa4f00d9 · qiaolongfei · 5e8d8e07 · 4db23bba · fa4f00d9 · fa4f00d9
9 changed files
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -296,7 +296,7 @@ function(go_library TARGET_NAME)
    COMMAND rm -rf ${PADDLE_IN_GOPATH}                                                                                                                                         
    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
    # Automatically get all dependencies specified in the source code                                                                                                                                 
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./..
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./...
    # Golang build source code
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"

--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -25,21 +25,24 @@ class Variable {
 public:
  template <typename T>
  const T& Get() const {
-    PADDLE_ASSERT(holder_ != nullptr);
-    PADDLE_ASSERT(std::type_index(typeid(T)) ==
-                  std::type_index(holder_->Type()));
+    PADDLE_ASSERT(IsType<T>());
    return *static_cast<const T*>(holder_->Ptr());
  }

  template <typename T>
  T* GetMutable() {
-    if (holder_ == nullptr ||
-        std::type_index(typeid(T)) != std::type_index(holder_->Type())) {
+    if (!IsType<T>()) {
      holder_.reset(new PlaceholderImpl<T>(new T()));
    }
    return static_cast<T*>(holder_->Ptr());
  }

+  template <typename T>
+  bool IsType() const {
+    return holder_ != nullptr &&
+           std::type_index(typeid(T)) == std::type_index(holder_->Type());
+  }
+
 private:
  struct Placeholder {
    virtual ~Placeholder() {}

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) {
 void Layer::copyOutputToOtherDevice() {
  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
    SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // the copyFrom is a synchronous interface.
+    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
+    // calculations are all on HPPL_STREAM_DEFAULT,
+    // copyFrom can be an asynchronous interface.
    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
                                          HPPL_STREAM_DEFAULT);
    outputOtherDevice_[i].sequenceStartPositions =

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
                    const_cast<real*>(src.getData()),
                    sizeof(real) * elementCnt_,
                    stream);
+    // Synchronize so that the copy has completed before this function returns.
+    hl_stream_synchronize(stream);
  } else if (typeid(src) == typeid(CpuMatrix)) {
    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
  } else {

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -239,7 +239,8 @@ public:
    LOG(FATAL) << "Not implemented";
  }

-  // asynchronous copy
+  // For GpuMatrix this is an asynchronous copy interface
+  // For CpuMatrix this is a synchronous copy interface
  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
    LOG(FATAL) << "Not implemented";
  }

--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -657,6 +657,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
                    (void*)src.getData(),
                    sizeof(T) * this->getSize(),
                    stream);
+    // Synchronize so that the copy has completed before this function returns.
+    hl_stream_synchronize(stream);
  } else {
    src.copyTo(this);
  }

--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -168,11 +168,11 @@ public:
  virtual void copyFrom(const VectorT<T>& src) = 0;

  /**
-   * If use_gpu, this function will push the copy-task to the specifed-stream
-   * and return immediately.
+   * If GpuVector, this function is an asynchronous interface:
+   * it pushes the copy-task to the specified stream and returns immediately.
   *
-   * If not use GPU, this function is same as
-   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
+   * If CpuVector, this function is a synchronous interface,
+   * same as the copyFrom(const VectorT<T>& src).
   */
  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;


--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) {
  }
 }

+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 1000;
+  const size_t width = 1000;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
 #endif
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -31,6 +31,7 @@ Configuring cmake in /paddle/build ...
      -DWITH_DOC=OFF
      -DWITH_GPU=${WITH_GPU:-OFF}
      -DWITH_AVX=${WITH_AVX:-OFF}
+      -DWITH_GOLANG=${WITH_GOLANG:-OFF}
      -DWITH_SWIG_PY=ON
      -DCUDNN_ROOT=/usr/
      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
@@ -43,6 +44,7 @@ cmake .. \
      -DWITH_DOC=OFF \
      -DWITH_GPU=${WITH_GPU:-OFF} \
      -DWITH_AVX=${WITH_AVX:-OFF} \
+      -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
      -DWITH_SWIG_PY=ON \
      -DCUDNN_ROOT=/usr/ \
      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \