diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 35e69dcb20411b77d3b24edf0e9d96bf8cbf1aa2..ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index d0ab640485baf6d76ee629ea420b603f42b031b4..be2b301619639106ac7b578e5a79cf33f4379e48 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -48,12 +48,26 @@ namespace framework {
  */
 struct LoD : public std::vector<Vector<size_t>> {
   using std::vector<Vector<size_t>>::vector;
+  platform::Place place() const {
+    if (this->size() == 0) {
+      // Not initialized yet.
+      return platform::CPUPlace();
+    } else {
+      return this->front().place();
+    }
+  }
 
   void CopyFromCUDA() {
     for (auto it = this->begin(); it != this->end(); ++it) {
       it->CopyFromCUDA();
     }
   }
+
+  void CopyToPeer(platform::Place place) {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyToPeer(place);
+    }
+  }
 };
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index d4c9f00bd9c00f3cae68858ca46c5320fc117405..adea02e3b3fdcf4873de76ff91116f43ac9fe259 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
-TEST(Vector, Normal) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::memory;
-
-  paddle::framework::InitDevices();
-
-  paddle::framework::Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
-  }
-
-  vec.clear();
-  vec.CopyFromCUDA();
-
-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
-  }
-}
-
 TEST(LoD, data) {
   paddle::framework::InitDevices();
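
Note (not part of the patch): the lod_tensor.h hunk adds two LoD-level helpers. `place()` reports where the offset vectors currently live, defaulting to `CPUPlace()` while the LoD is still empty, and `CopyToPeer()` re-homes every level onto a target place. A minimal sketch of how they compose, assuming a CUDA build with the device context pool initialized; `MoveLoDToDevice` is a hypothetical helper, not part of the patch:

    #include "paddle/framework/lod_tensor.h"

    // Hypothetical helper: move all LoD offset vectors onto GPU 0.
    void MoveLoDToDevice(paddle::framework::LoD* lod) {
      // An empty LoD reports CPUPlace() until its first level is filled in.
      if (lod->size() == 0) return;
      // Re-home each level's offset vector onto the target device.
      lod->CopyToPeer(paddle::platform::CUDAPlace(0));
    }
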
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
index 422fbbac488abff846d0d79e393ecef5400de9d2..5202775515d335ff81bb17e6ce21338c40041ca3 100644
--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -40,20 +40,21 @@ class Vector : public std::vector<T> {
   Vector() {}
   Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
 
-  virtual ~Vector() {
-#ifdef PADDLE_WITH_CUDA
-    if (cuda_ptr_ != nullptr) {
-      memory::Free(place_, cuda_ptr_);
-    }
-#endif
-  }
+  inline platform::Place place() const { return place_; }
+
+  /*! Return a pointer to constant memory block. */
+  inline const T *data(platform::Place place) const;
+
+  /*! Return a pointer to mutable memory block. */
+  inline T *mutable_data(platform::Place place);
+
+  // TODO(dzhwinter): below interfaces should be removed
   /* Get device vector */
   T *cuda_data() {
     CopyToCUDA();
     PADDLE_ENFORCE_NOT_NULL(
         cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
-    return static_cast<T *>(cuda_ptr_);
+    return static_cast<T *>(cuda_ptr_.get());
   }
 
   /* Get host vector */
@@ -76,25 +77,73 @@ class Vector : public std::vector<T> {
   void CopyToPeer(platform::Place);
 
  private:
-  void *cuda_ptr_ = nullptr;
+  std::shared_ptr<void> cuda_ptr_;
   size_t cuda_size_ = 0;  // device vector numel
   platform::CUDAPlace place_;
 };
 
 template <typename T>
-void Vector<T>::CopyToCUDA() {
+inline const T *Vector<T>::data(platform::Place place) const {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (cuda_ptr_ == nullptr) {
+      return nullptr;
+    }
+    if (boost::get<platform::CUDAPlace>(place) == place_) {
+      return static_cast<const T *>(cuda_ptr_.get());
+    } else {
+      PADDLE_THROW(
+          "Unmatched place. Please use `mutable_data` to copy data to the "
+          "target place first.");
+    }
+  } else {
+    PADDLE_THROW("Unsupported place.");
+  }
+}
+
+template <typename T>
+inline T *Vector<T>::mutable_data(platform::Place place) {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (boost::get<platform::CUDAPlace>(place) != place_) {
+      place_ = boost::get<platform::CUDAPlace>(place);
+    }
 #ifdef PADDLE_WITH_CUDA
-  if (cuda_size_ < this->size()) {
-    if (cuda_ptr_ != nullptr) {
-      memory::Free(place_, cuda_ptr_);
+    if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+      cuda_ptr_.reset(
+          memory::Alloc(place_, this->size() * sizeof(T)),
+          memory::PlainDeleter<void, platform::CUDAPlace>(place_));
     }
-    cuda_ptr_ =
-        memory::Alloc(place_, this->size() * sizeof(T));
+    cuda_size_ = this->size();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *ctx = pool.GetByPlace(place_);
+    memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+                 static_cast<const void *>(this->data()),
+                 this->size() * sizeof(T), ctx->stream());
+    ctx->Wait();
+    return static_cast<T *>(cuda_ptr_.get());
+#else
+    return nullptr;
+#endif
+  } else {
+    PADDLE_THROW("Unsupported place.");
+  }
+}
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
   }
   cuda_size_ = this->size();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
-  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
                static_cast<const void *>(this->data()),
                this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
@@ -112,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *ctx = pool.GetByPlace(place_);
   memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               ctx->stream());
+               static_cast<const void *>(cuda_ptr_.get()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
 #endif
 }
 
 template <typename T>
-void Vector<T>::CopyToPeer(platform::Place peer_place) {
+void Vector<T>::CopyToPeer(platform::Place place) {
 #ifdef PADDLE_WITH_CUDA
-  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr = memory::Alloc(
-      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
-               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+  if (boost::get<platform::CUDAPlace>(place) != place_) {
+    place_ = boost::get<platform::CUDAPlace>(place);
+  }
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
   ctx->Wait();
-
-  memory::Free(place_, cuda_ptr_);
-  place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr;
 #endif
 }
 
-template class Vector<int>;
-template class Vector<unsigned>;
-template class Vector<size_t>;
-template class Vector<int64_t>;
-
 }  // namespace framework
 }  // namespace paddle
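
Note (not part of the patch): the new `data(place)` / `mutable_data(place)` pair replaces the destructor-managed raw `cuda_ptr_`. `mutable_data` allocates or grows the device buffer on demand, syncs the host contents over, and returns a writable pointer, while `data` only hands back an already-cached pointer for a matching place. Because the buffer is now held in a `std::shared_ptr<void>`, copies of a `Vector` share one device block and the last owner frees it. A usage sketch under those semantics, assuming a CUDA build and device 0:

    paddle::framework::Vector<size_t> offsets({0, 2, 5});
    paddle::platform::CUDAPlace gpu(0);

    // Allocates on demand, copies host -> device, returns a device pointer.
    size_t* d_offsets = offsets.mutable_data(gpu);

    // Read-only: returns the cached pointer for a matching place and throws
    // if the data has never been copied to that place.
    const size_t* h_offsets = offsets.data(paddle::platform::CPUPlace());
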
diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b571788ad1ade50e05dc9a70cba35b83f8db3ea
--- /dev/null
+++ b/paddle/framework/mixed_vector_test.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/mixed_vector.h"
+
+using namespace paddle::framework;
+using namespace paddle::platform;
+using namespace paddle::memory;
+
+template <typename T>
+__global__ void test(T* data, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    data[i] *= 2;
+  }
+}
+
+TEST(Vector, Normal) {
+  // fill the device context pool.
+  InitDevices();
+
+  Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(Vector, MultipleCopy) {
+  InitDevices();
+  Vector<size_t> vec({1, 2, 3});
+  CUDAPlace place(0);
+  vec.mutable_data(place);
+  auto vec2 = Vector<size_t>(vec);
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i]);
+    }
+  }
+  test<<<3, 3>>>(vec2.mutable_data(place), vec2.size());
+  vec2.CopyFromCUDA();
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i] * 2);
+    }
+  }
+}
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 7012b6d331d0c4631a3d120fbaf3db7c97298ac7..30ed68c6e0ea95d206658d16684800e169ededc5 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -81,5 +81,23 @@ class PODDeleter {
   Place place_;
 };
 
+/**
+ * \brief Free a memory block in one place; unlike PODDeleter, T is not
+ *        required to be a POD type.
+ *
+ * \note Used as a custom deleter so that smart pointers can deallocate
+ *       memory automatically, e.g. the std::shared_ptr<void> in
+ *       mixed_vector.h (cf. PODDeleter for std::unique_ptr<T> in tensor.h).
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
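
Note (not part of the patch): PlainDeleter is the non-POD counterpart of PODDeleter; it simply frees the block in its place, with no `is_pod` check, which is what lets mixed_vector.h hand ownership of a raw `Alloc` result to a `std::shared_ptr<void>`. A standalone sketch of that pairing, with a hypothetical block size and a CUDA build assumed:

    paddle::platform::CUDAPlace gpu(0);
    std::shared_ptr<void> block(
        paddle::memory::Alloc(gpu, 256 * sizeof(float)),
        paddle::memory::PlainDeleter<void, paddle::platform::CUDAPlace>(gpu));
    // memory::Free(gpu, ptr) runs automatically once the last owner drops block.
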
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index dfff6f0888a5258dfbd68d34ef83ca8306aeed2a..89045923f9ff2f33bc112b199c493047440e15c4 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
   if (src.IsType<LoDTensor>()) {
     if (src.Get<LoDTensor>().place() == dst_place) {
       dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+      framework::LoD lod(src.Get<LoDTensor>().lod());
+      lod.CopyToPeer(dst_place);
+      dst->GetMutable<LoDTensor>()->set_lod(lod);
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
     auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
     dst_sr->set_height(src_sr.height());
     if (src_sr.value().place() == dst_place) {
       dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+      framework::Vector<int64_t> lod(src_sr.rows());
+      lod.CopyToPeer(dst_place);
+      dst_sr->set_rows(lod);
     }
   } else {
     PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
         framework::Copy(src, place, dst);
+        framework::LoD lod(src.lod());
+        lod.CopyToPeer(place);
+        dst->set_lod(lod);
       }
     }
     WaitOnPlaces(places);
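
Note (not part of the patch): the parallel_do_op.cc changes close a correctness gap. When CopyOrShare copies a tensor to a different place, the LoD (or the SelectedRows row index) still refers to buffers on the source place, so it has to be re-homed before being attached to the destination. A simplified restatement of the LoDTensor branch, with `src` as a `const framework::LoDTensor&` and `dst` as a `framework::LoDTensor*`:

    if (src.place() == dst_place) {
      dst->ShareDataWith(src);   // same place: aliasing data and LoD is safe
      dst->set_lod(src.lod());
    } else {
      framework::Copy(src, dst_place, dst);  // cross-place data copy
      framework::LoD lod(src.lod());
      lod.CopyToPeer(dst_place);             // re-home offsets before attaching
      dst->set_lod(lod);
    }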