未验证 提交 abde3130 编写于 作者: D dzhwinter 提交者: GitHub

"remove GPU Sync Interface" (#6793)

* "remove GPU Sync Interface"

* "fix typo"

* "fix type cast error"

* "fix related Copy with stream"

* "fix failed tests with DevicePool"

* "fix stupid removed position error"
上级 658dbe6c
......@@ -40,6 +40,16 @@ class DeviceContextPool {
return *pool;
}
const platform::DeviceContext* Borrow(const platform::Place& place) {
auto range = device_contexts_.equal_range(place);
if (range.first == range.second) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
return range.first->second;
}
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
......
......@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
}
}
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
#endif
} // namespace memory
......
......@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::GPUPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1});
......@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
gpu_dst);
......@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::GPUPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
......@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
......
......@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(
......
......@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);
......
......@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) {
CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
......@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) {
GPUPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
......
......@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/tensor.h"
#include "paddle/memory/memcpy.h"
#include "pybind11/numpy.h"
......@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace()));
// TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
// a Python numpy array. It's better to manage CDUA stream unifiedly.
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost);
framework::DeviceContextPool &pool =
framework::DeviceContextPool::Get();
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
pool.Borrow(tensor.place()));
paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream());
#else
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
#endif
......@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place);
// TODO(qijun): Here we use default CUDA stream to set a Python numpy
// array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice);
framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
}
#endif
......
......@@ -341,6 +341,10 @@ class TestBatchNormOp(OpTest):
places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
places.append(core.GPUPlace(0))
core.init_devices(["CPU", "GPU:0"])
else:
core.init_devices(["CPU"])
for place in places:
for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format, [2, 3, 4, 5])
......
import unittest
import numpy
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator
import numpy
from paddle.v2.fluid.executor import Executor
class TestGaussianRandomOp(unittest.TestCase):
def setUp(self):
self.op_type = "gaussian_random"
self.inputs = {}
self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
self.outputs = ["Out"]
def test_cpu(self):
self.gaussian_random_test(place=core.CPUPlace())
self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self):
if core.is_compile_gpu():
self.gaussian_random_test(place=core.GPUPlace(0))
self.gaussian_random_test(place=fluid.GPUPlace(0))
def gaussian_random_test(self, place):
scope = core.Scope()
scope.var('Out').get_tensor()
op = Operator(
"gaussian_random",
Out='Out',
shape=[1000, 784],
mean=.0,
std=1.,
seed=10)
context = core.DeviceContext.create(place)
op.run(scope, context)
tensor = numpy.array(scope.find_var('Out').get_tensor())
program = fluid.Program()
block = program.global_block()
vout = block.create_var(name="Out")
op = block.append_op(
type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
fetch_list = []
for var_name in self.outputs:
fetch_list.append(block.var(var_name))
exe = Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
......
import unittest
import numpy
from paddle.v2.fluid.op import Operator
import paddle.v2.fluid.core as core
import numpy
import paddle.v2.fluid as fluid
class TestUniformRandomOp(unittest.TestCase):
def test_uniform_random_cpu(self):
def setUp(self):
self.op_type = "uniform_random"
self.inputs = {}
self.attrs = {
"shape": [1000, 784],
"min": -5.0,
"max": 10.0,
"seed": 10
}
self.outputs = ["Out"]
def test_cpu(self):
self.uniform_random_test(place=core.CPUPlace())
def test_uniform_random_gpu(self):
def test_gpu(self):
if core.is_compile_gpu():
self.uniform_random_test(place=core.GPUPlace(0))
def uniform_random_test(self, place):
scope = core.Scope()
scope.var('X').get_tensor()
op = Operator(
"uniform_random",
Out='X',
shape=[1000, 784],
min=-5.0,
max=10.0,
seed=10)
ctx = core.DeviceContext.create(place)
op.run(scope, ctx)
tensor = numpy.array(scope.find_var('X').get_tensor())
context = core.DeviceContext.create(place)
program = fluid.Program()
block = program.global_block()
vout = block.create_var(name="Out")
op = block.append_op(
type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
fetch_list = []
for var_name in self.outputs:
fetch_list.append(block.var(var_name))
exe = fluid.Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册