Commit 2603cb7e authored by QI JUN, committed by Yi Wang

Unify CUDA stream in Tensor CopyFrom interface (#4692)

* init

* unify CopyFrom interface

* fix gpu build error

* fix bug in tensor_py.h

* refine code comments and add TODO list

* fix conflicts in FeedOp and FetchOp
Parent d92f8de4
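The change threads a platform::DeviceContext through every Tensor copy, so GPU copies are issued on the context's CUDA stream instead of the default stream 0, and the implicit cudaStreamSynchronize(0) inside CopyFrom/CopyFromVector is removed. A minimal call-site sketch, not part of this commit; the include paths, type names, and the explicit Wait() follow the tests in the diff below:

```cpp
#include <vector>

#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"

void CopyWithExplicitContext() {
  using paddle::framework::Tensor;
  namespace platform = paddle::platform;

  platform::GPUPlace gpu_place(0);
  platform::CPUPlace cpu_place;
  platform::CUDADeviceContext gpu_ctx(gpu_place);  // owns the CUDA stream

  std::vector<float> src_vec = {1, 2, 3, 4};
  Tensor gpu_tensor, cpu_tensor;
  gpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));

  // Host -> device: the copy is enqueued on gpu_ctx's stream.
  gpu_tensor.CopyFromVector<float>(src_vec, gpu_ctx);

  // Device -> host: the same context supplies the stream.
  cpu_tensor.CopyFrom<float>(gpu_tensor, cpu_place, gpu_ctx);

  // CopyFrom no longer synchronizes internally, so wait on the
  // context before reading the data on the CPU side.
  gpu_ctx.Wait();
}
```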
@@ -88,25 +88,30 @@ class Tensor {
    * @brief Copy the content of external tensor to a new place.
    *
    * @param[in] src The external tensor.
-   * @param[in] ctx The device context contains place where to store.
+   * @param[in] dst_place The dst place.
+   * @param[in] ctx The device context contains device resources.
    *
    * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
    */
+  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
+  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
+  // and make them global functions
   template <typename T>
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);
+  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                       const platform::DeviceContext& ctx);

   /**
    * @brief Copy the content of an external vector to a tensor.
    *
-   * @param[in] src The external vector.
-   * @param[in] ctx The device context contains place where to store.
+   * @param[in] src The external tensor.
+   * @param[in] ctx The device context contains device resources.
    *
    * * @note CopyFromVector assumes that the tensor has been resized
    *        before invoking.
    */
   template <typename T>
   inline void CopyFromVector(const std::vector<T>& src,
-                             const platform::Place& dst_place);
+                             const platform::DeviceContext& ctx);

   /**
    * @brief Return the slice of the tensor.
...
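For reference, a hypothetical CPU-side snippet (not from the commit) matching the doc comments above: CopyFromVector now takes only the device context, the destination place comes from ctx.GetPlace(), and the tensor must be resized before the call.

```cpp
#include <vector>

#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"

void CpuCopyFromVector() {
  paddle::framework::Tensor t;
  // CopyFromVector assumes the tensor has already been resized.
  t.Resize(paddle::framework::make_ddim({2, 3}));

  std::vector<float> src(6, 1.0f);

  // The destination place is taken from the context, so no separate
  // dst_place argument is passed any more.
  paddle::platform::CPUDeviceContext cpu_ctx{paddle::platform::CPUPlace()};
  t.CopyFromVector<float>(src, cpu_ctx);
}
```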
@@ -95,7 +95,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
   values_[index].Resize(value.dims());
   values_[index].mutable_data<value_type>(platform::CPUPlace());
-  values_[index].CopyFrom<value_type>(value, platform::CPUPlace());
+  values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
+                                      platform::CPUDeviceContext());
 }

 void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@@ -151,7 +152,8 @@ LoDTensor TensorArray::Stack() const {
   for (size_t idx = 0; idx < size(); idx++) {
     result.Slice<value_type>(idx, idx + 1)
-        .CopyFrom<value_type>(Read(idx), platform::CPUPlace());
+        .CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
+                              platform::CPUDeviceContext());
   }
   return result;
 }
@@ -182,7 +184,8 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
       // copy
       value.Resize(value_dims);
       value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
-                                 platform::CPUPlace());
+                                 platform::CPUPlace(),
+                                 platform::CPUDeviceContext());
     }
   }
 }
@@ -236,7 +239,8 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
     auto target = result.Slice<value_type>(i, i + 1);
     auto source_ = source->Slice<value_type>(index, index + 1);
-    target.CopyFrom<value_type>(source_, platform::CPUPlace());
+    target.CopyFrom<value_type>(source_, platform::CPUPlace(),
+                                platform::CPUDeviceContext());
   }
   return result;
@@ -269,7 +273,8 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
       if (index >= seq_meta.end) break;
       auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
       auto target = result.Slice<float>(index, index + 1);
-      target.CopyFrom<float>(source_, platform::CPUPlace());
+      target.CopyFrom<float>(source_, platform::CPUPlace(),
+                             platform::CPUDeviceContext());
     }
   }
...
@@ -88,7 +88,8 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
 template <typename T>
 inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place) {
+                             const platform::Place& dst_place,
+                             const platform::DeviceContext& ctx) {
   src.check_memory_size<T>();
   Resize(src.dims());
@@ -106,26 +107,45 @@ inline void Tensor::CopyFrom(const Tensor& src,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&
            platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
-  PADDLE_ENFORCE(cudaStreamSynchronize(0),
-                 "cudaStreamSynchronize failed in Tensor CopyFrom");
 #endif
 }

 template <typename T>
 inline void Tensor::CopyFromVector(const std::vector<T>& src,
-                                   const platform::Place& dst_place) {
+                                   const platform::DeviceContext& ctx) {
+  auto dst_place = ctx.GetPlace();
   auto src_ptr = static_cast<const void*>(src.data());
   platform::CPUPlace src_place;
   auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -137,12 +157,11 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
   }
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place,
-                 src_ptr, size, 0);
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
-  PADDLE_ENFORCE(cudaStreamSynchronize(0),
-                 "cudaStreamSynchronize failed in Tensor CopyFromVector");
 #endif
 }
...
@@ -194,6 +194,7 @@ TEST(Tensor, CopyFrom) {
   {
     Tensor src_tensor;
     Tensor dst_tensor;
+    CPUDeviceContext cpu_ctx((CPUPlace()));
     int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
@@ -201,7 +202,7 @@ TEST(Tensor, CopyFrom) {
     memcpy(src_ptr, arr, 9 * sizeof(int));
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
     const int* dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(src_ptr, dst_ptr);
@@ -210,7 +211,7 @@ TEST(Tensor, CopyFrom) {
     }
     Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(dst_ptr, slice_ptr);
@@ -231,13 +232,15 @@ TEST(Tensor, CopyFrom) {
     // CPU Tensor to GPU Tensor
     auto gpu_place = new paddle::platform::GPUPlace(0);
-    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
     // GPU Tensor to CPU Tensor
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
-    // Compare Tensors
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
     const int* dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(src_ptr, dst_ptr);
     for (size_t i = 0; i < 9; ++i) {
@@ -247,12 +250,13 @@ TEST(Tensor, CopyFrom) {
     Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
     // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
+    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
     // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
-    // Compare Slice Tensors
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(dst_ptr, slice_ptr);
@@ -273,7 +277,8 @@ TEST(Tensor, CopyFromVector) {
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -285,7 +290,7 @@ TEST(Tensor, CopyFromVector) {
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
     cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -306,16 +311,19 @@ TEST(Tensor, CopyFromVector) {
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
     // Copy to GPUTensor
     gpu_tensor.Resize(make_ddim({3, 3}));
     auto gpu_place = new paddle::platform::GPUPlace();
-    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
     // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
-    // Compare Tensors
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
     const int* src_ptr = src_vec.data();
     const int* cpu_ptr = cpu_tensor.data<int>();
     const int* dst_ptr = dst_tensor.data<int>();
@@ -329,11 +337,13 @@ TEST(Tensor, CopyFromVector) {
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
     cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
     gpu_tensor.Resize(make_ddim({2, 2}));
-    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
     src_ptr = src_vec.data();
     cpu_ptr = cpu_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
...
@@ -34,7 +34,7 @@ class FeedKernel : public framework::OpKernel<T> {
     // TODO(qijun):
     // check tensors[col].dims() with attribute,
     // except the first dimenson.
-    out->CopyFrom<T>(tensors[col], ctx.GetPlace());
+    out->CopyFrom<T>(tensors[col], ctx.GetPlace(), ctx.device_context());
   }
 };
...
@@ -35,7 +35,8 @@ class FetchKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_GT(tensors->size(), static_cast<size_t>(col));
     (*tensors)[col].Resize(input->dims());
     (*tensors)[col].mutable_data<T>(platform::CPUPlace());
-    (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
+    (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace(),
+                                ctx.device_context());
     // TODO(qijun): need to handle LodTensor later
   }
 };
...
@@ -49,10 +49,22 @@ void testIm2col() {
   memcpy(input_ptr, arr, 6 * sizeof(float));
   auto* place = new Place();
+  paddle::platform::DeviceContext* context;
+  if (paddle::platform::is_cpu_place(*place)) {
+    context =
+        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    context =
+        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
+#else
+    PADDLE_THROW("no GPU support");
+#endif  // PADDLE_ONLY_CPU
+  }
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom<float>(input_tmp, *place);
+    input.CopyFrom<float>(input_tmp, *place, *context);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -66,18 +78,6 @@ void testIm2col() {
       paddle::operators::math::ColFormat::kOCF, Place, float>
       im2col_ocf;
-  paddle::platform::DeviceContext* context;
-  if (paddle::platform::is_cpu_place(*place)) {
-    context =
-        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
-  } else {
-#ifdef PADDLE_WITH_CUDA
-    context =
-        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
-#else
-    PADDLE_THROW("no GPU support");
-#endif  // PADDLE_ONLY_CPU
-  }
   im2col(*context, input, output_cfo, stride, stride, padding, padding);
   im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
@@ -85,7 +85,8 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
+                               *context);
     out_cfo_ptr = output_tmp.data<float>();
   }
   EXPECT_EQ(out_cfo_ptr[0], 0);
@@ -101,7 +102,8 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
+                               *context);
     out_ocf_ptr = output_tmp.data<float>();
   }
   EXPECT_EQ(out_ocf_ptr[0], 0);
...
@@ -17,17 +17,18 @@ TEST(math_function, notrans_mul_trans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
-  out.CopyFrom<float>(out_gpu, *cpu_place);
+  out.CopyFrom<float>(out_gpu, *cpu_place, context);
   float* out_ptr = out.data<float>();
+  context.Wait();
   EXPECT_EQ(out_ptr[0], 5);
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
@@ -50,17 +51,18 @@ TEST(math_function, trans_mul_notrans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
-  out.CopyFrom<float>(out_gpu, *cpu_place);
+  out.CopyFrom<float>(out_gpu, *cpu_place, context);
   float* out_ptr = out.data<float>();
+  context.Wait();
   EXPECT_EQ(out_ptr[0], 9);
   EXPECT_EQ(out_ptr[1], 12);
   EXPECT_EQ(out_ptr[2], 15);
@@ -98,9 +100,9 @@ TEST(math_function, gemm_notrans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -108,7 +110,7 @@ TEST(math_function, gemm_notrans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -116,6 +118,7 @@ TEST(math_function, gemm_notrans_cublas) {
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
+  context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
@@ -152,9 +155,9 @@ TEST(math_function, gemm_trans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -162,7 +165,8 @@ TEST(math_function, gemm_trans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
+  context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
...
@@ -33,7 +33,8 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
+                                  ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
@@ -70,7 +71,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
+                                  ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
...
@@ -46,7 +46,7 @@ void RecurrentAlgorithm::Run(const Scope& scope,
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx);
 }

 void RecurrentAlgorithm::CreateScopes(const Scope& scope,
@@ -151,12 +151,12 @@ void RecurrentGradientAlgorithm::Run(
   auto& step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
   for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
-    if (step_id != seq_len - 1) {
+    if (static_cast<size_t>(step_id) != seq_len - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx);
   LinkBootMemoryGradients(step_scopes[0]);
 }
...
@@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto out_dims = framework::make_ddim(shape_int64);
-    out->CopyFrom<T>(*in, ctx.GetPlace());
+    out->CopyFrom<T>(*in, ctx.GetPlace(), ctx.device_context());
     out->Resize(out_dims);
   }
 };
@@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     d_x->mutable_data<T>(ctx.GetPlace());
     auto in_dims = d_x->dims();
-    d_x->CopyFrom<T>(*d_out, ctx.GetPlace());
+    d_x->CopyFrom<T>(*d_out, ctx.GetPlace(), ctx.device_context());
     d_x->Resize(in_dims);
   }
 };
...
@@ -51,7 +51,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len) {
+                   const size_t seq_len, const platform::DeviceContext& ctx) {
   for (size_t i = 0; i < outlinks.size(); i++) {
     auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
@@ -72,7 +72,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
       // TODO(luotao02) data type and platform::DeviceContext() should set
       // correctly
       (output->Slice<float>(j, j + 1))
-          .CopyFrom<float>(*step_output, platform::CPUPlace());
+          .CopyFrom<float>(*step_output, platform::CPUPlace(), ctx);
     }
   }
 }
...
@@ -71,7 +71,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
  */
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len);
+                   const size_t seq_len, const platform::DeviceContext& ctx);
 void LinkMemories(const std::vector<Scope*>& step_scopes,
                   const std::vector<MemoryAttr>& memories, const size_t step_id,
...
@@ -57,7 +57,18 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
     }
     framework::Tensor dst_tensor;
     if (paddle::platform::is_gpu_place(tensor.place())) {
-      dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+      auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
+      auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
+          tensor.dims(), platform::CPUPlace()));
+      // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
+      // a Python numpy array. It's better to manage CDUA stream unifiedly.
+      paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                      sizeof(CUR_TYPE) * tensor.numel(),
+                                      cudaMemcpyDeviceToHost);
+#else
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#endif
     } else if (paddle::platform::is_cpu_place(tensor.place())) {
       dst_tensor = tensor;
     }
@@ -120,6 +131,8 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
+  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
+  // array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
   paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice);
 }
...