Unverified commit abde3130, authored by dzhwinter, committed by GitHub

"remove GPU Sync Interface" (#6793)

* "remove GPU Sync Interface"

* "fix typo"

* "fix type cast error"

* "fix related Copy with stream"

* "fix failed tests with DevicePool"

* "fix stupid removed position error"
Parent 658dbe6c
@@ -40,6 +40,16 @@ class DeviceContextPool {
     return *pool;
   }
 
+  const platform::DeviceContext* Borrow(const platform::Place& place) {
+    auto range = device_contexts_.equal_range(place);
+    if (range.first == range.second) {
+      PADDLE_THROW(
+          "'Place' is not supported, Please re-compile with WITH_GPU "
+          "option");
+    }
+    return range.first->second;
+  }
+
   std::vector<const platform::DeviceContext*> Borrow(
       const std::vector<platform::Place>& places) {
     PADDLE_ENFORCE_GT(places.size(), 0);
......
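The hunk above adds a single-place Borrow overload to DeviceContextPool, so call sites can look up the context (and thus the CUDA stream) that owns a given place instead of constructing ad-hoc contexts. A minimal usage sketch, assuming the pool has already been populated with the available places at startup (initialization details are omitted):

    // Sketch only: assumes the pool already holds a context for GPU 0.
    framework::DeviceContextPool& pool = framework::DeviceContextPool::Get();
    platform::GPUPlace gpu0(0);
    auto* dev_ctx =
        static_cast<const platform::CUDADeviceContext*>(pool.Borrow(gpu0));
    cudaStream_t stream = dev_ctx->stream();  // stream to pass to async copies

Borrow throws when no context is registered for the requested place, which is why the error message asks users to re-compile with the WITH_GPU option.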
@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
   }
 }
 
-template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(src_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
-}
-
 #endif
 
 }  // namespace memory
......
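With the synchronous host/device Copy specializations removed above, copies between CPUPlace and GPUPlace go through the stream-taking overloads only, as the updated tests below show. A hedged sketch of the replacement call pattern; the buffer names are illustrative and not part of the diff:

    // Sketch only: copy a host buffer to GPU 0 on the pool's stream, then
    // block only if the caller still needs synchronous semantics.
    platform::GPUPlace gpu0(0);
    platform::CPUPlace cpu;
    float cpu_src[4] = {1.f, 2.f, 3.f, 4.f};
    auto* gpu_dst = static_cast<float*>(memory::Alloc(gpu0, sizeof(cpu_src)));
    auto* dev_ctx = static_cast<const platform::CUDADeviceContext*>(
        framework::DeviceContextPool::Get().Borrow(gpu0));
    memory::Copy(gpu0, gpu_dst, cpu, cpu_src, sizeof(cpu_src),
                 dev_ctx->stream());
    cudaStreamSynchronize(dev_ctx->stream());  // optional blocking step
    memory::Free(gpu0, gpu_dst);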
@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) {
   platform::GPUPlace gpu0(0);
   platform::CPUPlace cpu;
 
+  platform::CUDADeviceContext ctx(gpu0);
+
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   framework::DDim src_stride({5, 1});
 
@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({2, 1});
 
-  platform::CUDADeviceContext ctx(gpu0);
   StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
                      gpu_dst);
 
@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) {
   platform::GPUPlace gpu0(0);
   platform::CPUPlace cpu;
 
+  platform::CUDADeviceContext ctx(gpu0);
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   int dst[8];
   int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
 
@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
   framework::DDim src_stride({2, 1});
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({4, 1});
-  platform::CUDADeviceContext ctx(gpu0);
 
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
......
@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
       "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
 }
 
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind) {
-  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
-  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
-  //       block on stream 0 to make sure the copy has completed
-  PADDLE_ENFORCE(
-      cudaStreamSynchronize(0),
-      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
-}
-
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(
......
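Since GpuMemcpySync is deleted above (and its declaration removed below), callers that still need a blocking copy can combine the remaining GpuMemcpyAsync with a stream synchronization. A minimal sketch, assuming dst, src, count, and stream are supplied by the caller:

    // Sketch only: equivalent blocking behavior without GpuMemcpySync.
    platform::GpuMemcpyAsync(dst, src, count, cudaMemcpyDeviceToHost, stream);
    PADDLE_ENFORCE(cudaStreamSynchronize(stream),
                   "cudaStreamSynchronize failed after GpuMemcpyAsync");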
@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
 
-//! Copy memory from address src to dst synchronously.
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind);
-
 //! Copy memory from one device to another device.
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream);
......
@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) {
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
 
@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) {
   GPUPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
......
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+#include "paddle/framework/executor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
 #include "pybind11/numpy.h"
 
@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
-        // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
-        // a Python numpy array. It's better to manage CDUA stream unifiedly.
-        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
-                                        sizeof(CUR_TYPE) * tensor.numel(),
-                                        cudaMemcpyDeviceToHost);
+
+        framework::DeviceContextPool &pool =
+            framework::DeviceContextPool::Get();
+        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
+            pool.Borrow(tensor.place()));
+
+        paddle::platform::GpuMemcpyAsync(
+            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
+            cudaMemcpyDeviceToHost, dev_ctx->stream());
 #else
         PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
 #endif
 
@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
-  // array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+
+  framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
 #endif
......
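The pybind changes above follow the same pattern: borrow the device context that owns the tensor's place and issue the copy on that context's stream. A condensed, hedged sketch of the flow outside the pybind plumbing; tensor and dst_ptr are placeholders for the real arguments:

    // Sketch only: device -> host copy on the tensor's own stream.
    framework::DeviceContextPool& pool = framework::DeviceContextPool::Get();
    auto* dev_ctx = static_cast<const platform::CUDADeviceContext*>(
        pool.Borrow(tensor.place()));
    platform::GpuMemcpyAsync(dst_ptr, tensor.data<float>(),
                             sizeof(float) * tensor.numel(),
                             cudaMemcpyDeviceToHost, dev_ctx->stream());
    cudaStreamSynchronize(dev_ctx->stream());  // wait before reading dst_ptr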
@@ -341,6 +341,10 @@ class TestBatchNormOp(OpTest):
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
             places.append(core.GPUPlace(0))
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
+
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
                 test_with_place(place, data_format, [2, 3, 4, 5])
......
 import unittest
+import numpy
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-import numpy
+from paddle.v2.fluid.executor import Executor
 
 
 class TestGaussianRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.inputs = {}
+        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+        self.outputs = ["Out"]
+
     def test_cpu(self):
-        self.gaussian_random_test(place=core.CPUPlace())
+        self.gaussian_random_test(place=fluid.CPUPlace())
 
     def test_gpu(self):
         if core.is_compile_gpu():
-            self.gaussian_random_test(place=core.GPUPlace(0))
+            self.gaussian_random_test(place=fluid.GPUPlace(0))
 
     def gaussian_random_test(self, place):
-        scope = core.Scope()
-        scope.var('Out').get_tensor()
-
-        op = Operator(
-            "gaussian_random",
-            Out='Out',
-            shape=[1000, 784],
-            mean=.0,
-            std=1.,
-            seed=10)
 
         context = core.DeviceContext.create(place)
-        op.run(scope, context)
-        tensor = numpy.array(scope.find_var('Out').get_tensor())
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
         self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
         self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
......
 import unittest
+import numpy
 from paddle.v2.fluid.op import Operator
 import paddle.v2.fluid.core as core
-import numpy
+import paddle.v2.fluid as fluid
 
 
 class TestUniformRandomOp(unittest.TestCase):
-    def test_uniform_random_cpu(self):
+    def setUp(self):
+        self.op_type = "uniform_random"
+        self.inputs = {}
+        self.attrs = {
+            "shape": [1000, 784],
+            "min": -5.0,
+            "max": 10.0,
+            "seed": 10
+        }
+        self.outputs = ["Out"]
+
+    def test_cpu(self):
         self.uniform_random_test(place=core.CPUPlace())
 
-    def test_uniform_random_gpu(self):
+    def test_gpu(self):
         if core.is_compile_gpu():
             self.uniform_random_test(place=core.GPUPlace(0))
 
     def uniform_random_test(self, place):
-        scope = core.Scope()
-        scope.var('X').get_tensor()
-
-        op = Operator(
-            "uniform_random",
-            Out='X',
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
-
-        ctx = core.DeviceContext.create(place)
-        op.run(scope, ctx)
-        tensor = numpy.array(scope.find_var('X').get_tensor())
+        context = core.DeviceContext.create(place)
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = fluid.Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
         self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
......