未验证 提交 47c13508 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #8149 from dzhwinter/fix/mixed_vector

Fix/mixed vector
...@@ -20,6 +20,7 @@ endif() ...@@ -20,6 +20,7 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
......
...@@ -48,12 +48,26 @@ namespace framework { ...@@ -48,12 +48,26 @@ namespace framework {
*/ */
struct LoD : public std::vector<Vector<size_t>> { struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector; using std::vector<Vector<size_t>>::vector;
platform::Place place() const {
if (this->size() == 0) {
// Not Initialze Yet.
return platform::CPUPlace();
} else {
return this->front().place();
}
}
void CopyFromCUDA() { void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) { for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA(); it->CopyFromCUDA();
} }
} }
void CopyToPeer(platform::Place place) {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyToPeer(place);
}
}
}; };
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
......
...@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) { ...@@ -28,28 +28,6 @@ __global__ void test(size_t* a, int size) {
} }
} }
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) { TEST(LoD, data) {
paddle::framework::InitDevices(); paddle::framework::InitDevices();
......
...@@ -40,20 +40,21 @@ class Vector : public std::vector<T> { ...@@ -40,20 +40,21 @@ class Vector : public std::vector<T> {
Vector() {} Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() { inline platform::Place place() const { return place_; }
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) { /*! Return a pointer to constant memory block. */
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_); inline const T *data(platform::Place place) const;
}
#endif /*! Return a pointer to mutable memory block. */
} inline T *mutable_data(platform::Place place);
// TODO(dzhwinter): below interfaces should be removed
/* Get device vector */ /* Get device vector */
T *cuda_data() { T *cuda_data() {
CopyToCUDA(); CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation"); cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_); return static_cast<T *>(cuda_ptr_.get());
} }
/* Get host vector */ /* Get host vector */
...@@ -76,25 +77,73 @@ class Vector : public std::vector<T> { ...@@ -76,25 +77,73 @@ class Vector : public std::vector<T> {
void CopyToPeer(platform::Place); void CopyToPeer(platform::Place);
private: private:
void *cuda_ptr_ = nullptr; std::shared_ptr<void> cuda_ptr_;
size_t cuda_size_ = 0; // device vector numel size_t cuda_size_ = 0; // device vector numel
platform::CUDAPlace place_; platform::CUDAPlace place_;
}; };
template <typename T> template <typename T>
void Vector<T>::CopyToCUDA() { inline const T *Vector<T>::data(platform::Place place) const {
if (platform::is_cpu_place(place)) {
return std::vector<T>::data();
} else if (platform::is_gpu_place(place)) {
if (cuda_ptr_ == nullptr) {
return nullptr;
}
if (boost::get<platform::CUDAPlace>(place) == place_) {
return static_cast<const T *>(cuda_ptr_.get());
} else {
PADDLE_THROW(
"Unmatched place. Please use `mutable_data` copy lod to the target "
"Place first.");
}
} else {
PADDLE_THROW("Unsupport Place.");
}
}
template <typename T>
inline T *Vector<T>::mutable_data(platform::Place place) {
if (platform::is_cpu_place(place)) {
return std::vector<T>::data();
} else if (platform::is_gpu_place(place)) {
if (boost::get<platform::CUDAPlace>(place) != place_) {
place_ = boost::get<platform::CUDAPlace>(place);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size()) { if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
if (cuda_ptr_ != nullptr) { cuda_ptr_.reset(
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_); memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
} }
cuda_ptr_ = cuda_size_ = this->size();
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait();
return static_cast<T *>(cuda_ptr_.get());
#else
return nullptr;
#endif
} else {
PADDLE_THROW("Unsupport Place.");
}
}
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
} }
cuda_size_ = this->size(); cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_); auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_, platform::CPUPlace(), memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()), static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream()); this->size() * sizeof(T), ctx->stream());
ctx->Wait(); ctx->Wait();
...@@ -112,32 +161,32 @@ void Vector<T>::CopyFromCUDA() { ...@@ -112,32 +161,32 @@ void Vector<T>::CopyFromCUDA() {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_); auto *ctx = pool.GetByPlace(place_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_, memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T), static_cast<const void *>(cuda_ptr_.get()),
ctx->stream()); this->size() * sizeof(T), ctx->stream());
ctx->Wait(); ctx->Wait();
#endif #endif
} }
template <typename T> template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) { void Vector<T>::CopyToPeer(platform::Place place) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_); if (boost::get<platform::CUDAPlace>(place) != place_) {
void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>( place_ = boost::get<platform::CUDAPlace>(place);
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T)); }
memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr, if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream()); cuda_ptr_.reset(
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
memory::PlainDeleter<void, platform::CUDAPlace>(place_));
}
cuda_size_ = this->size();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *ctx = pool.GetByPlace(place_);
memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), ctx->stream());
ctx->Wait(); ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr;
#endif #endif
} }
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda.h>
#include <cuda_runtime.h>
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/framework/mixed_vector.h"
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
template <typename T>
__global__ void test(T* data, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
data[i] *= 2;
}
}
TEST(Vector, Normal) {
// fill the device context pool.
InitDevices();
Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(Vector, MultipleCopy) {
InitDevices();
Vector<size_t> vec({1, 2, 3});
CUDAPlace place(0);
vec.mutable_data(place);
auto vec2 = Vector<size_t>(vec);
{
const size_t* ptr = vec2.data(CPUPlace());
for (size_t i = 0; i < vec2.size(); ++i) {
EXPECT_EQ(*(ptr + i), vec[i]);
}
}
test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
vec2.CopyFromCUDA();
{
const size_t* ptr = vec2.data(CPUPlace());
for (size_t i = 0; i < vec2.size(); ++i) {
EXPECT_EQ(*(ptr + i), vec[i] * 2);
}
}
}
...@@ -81,5 +81,23 @@ class PODDeleter { ...@@ -81,5 +81,23 @@ class PODDeleter {
Place place_; Place place_;
}; };
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src, ...@@ -76,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
if (src.IsType<LoDTensor>()) { if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) { if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>()); dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else { } else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>()); Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
framework::LoD lod(src.Get<LoDTensor>().lod());
lod.CopyToPeer(dst_place);
dst->GetMutable<LoDTensor>()->set_lod(lod);
} }
} else if (src.IsType<SelectedRows>()) { } else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>(); auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>(); auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_rows(src_sr.rows());
dst_sr->set_height(src_sr.height()); dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) { if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value()); dst_sr->mutable_value()->ShareDataWith(src_sr.value());
dst_sr->set_rows(src_sr.rows());
} else { } else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
framework::Vector<int64_t> lod(src_sr.rows());
lod.CopyToPeer(dst_place);
dst_sr->set_rows(lod);
} }
} else { } else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
...@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -145,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
auto *sub_scope = sub_scopes[i]; auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>(); auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::Copy(src, place, dst); framework::Copy(src, place, dst);
framework::LoD lod(src.lod());
lod.CopyToPeer(place);
dst->set_lod(lod);
} }
} }
WaitOnPlaces(places); WaitOnPlaces(places);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册