From abde3130b7ce5b8e8e3c74cd0670be2ce1e8eb6e Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Fri, 22 Dec 2017 12:35:40 +0800
Subject: [PATCH] "remove GPU Sync Interface" (#6793)

* "remove GPU Sync Interface"
* "fix typo"
* "fix type cast error"
* "fix related Copy with stream"
* "fix failed tests with DevicePool"
* "fix stupid removed position error"
---
 paddle/framework/executor.h                   | 10 ++++
 paddle/memory/memcpy.cc                       | 27 ----------
 paddle/operators/strided_memcpy_test.cc       |  9 ++--
 paddle/platform/gpu_info.cc                   | 11 ----
 paddle/platform/gpu_info.h                    |  4 --
 paddle/platform/transform_test.cu             |  8 +--
 paddle/pybind/tensor_py.h                     | 25 +++++----
 .../v2/fluid/tests/test_batch_norm_op.py      |  4 ++
 .../v2/fluid/tests/test_gaussian_random_op.py | 45 ++++++++++------
 .../v2/fluid/tests/test_uniform_random_op.py  | 52 +++++++++++++------
 10 files changed, 104 insertions(+), 91 deletions(-)

diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index 1faaacfefa3..fb861d47126 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -40,6 +40,16 @@ class DeviceContextPool {
     return *pool;
   }
 
+  const platform::DeviceContext* Borrow(const platform::Place& place) {
+    auto range = device_contexts_.equal_range(place);
+    if (range.first == range.second) {
+      PADDLE_THROW(
+          "'Place' is not supported, Please re-compile with WITH_GPU "
+          "option");
+    }
+    return range.first->second;
+  }
+
   std::vector<const platform::DeviceContext*> Borrow(
       const std::vector<platform::Place>& places) {
     PADDLE_ENFORCE_GT(places.size(), 0);
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index 1df88a6da9f..5c629dc3d2a 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
   }
 }
 
-template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(src_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
-}
-
 #endif
 
 }  // namespace memory
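
With the synchronous Copy specializations deleted, every GPU transfer goes through the remaining overloads that take a cudaStream_t. A minimal sketch of the calling pattern, assuming a CUDA build (the buffer, its contents, and the function name are hypothetical; ctx.Wait() is the explicit synchronization point when the host needs the data):

    #include "paddle/memory/memcpy.h"
    #include "paddle/memory/memory.h"
    #include "paddle/platform/device_context.h"

    // Hypothetical example of the stream-based copy API left after this patch.
    void CopyHostToGpuSketch() {
      paddle::platform::GPUPlace gpu0(0);
      paddle::platform::CPUPlace cpu;
      paddle::platform::CUDADeviceContext ctx(gpu0);  // owns the CUDA stream

      float host_buf[4] = {0.f, 1.f, 2.f, 3.f};  // hypothetical payload
      auto* gpu_buf = static_cast<float*>(
          paddle::memory::Alloc(gpu0, sizeof(host_buf)));

      // Enqueued on ctx.stream(); asynchronous with respect to the host.
      paddle::memory::Copy(gpu0, gpu_buf, cpu, host_buf, sizeof(host_buf),
                           ctx.stream());

      ctx.Wait();  // block only when the copy actually has to be finished
      paddle::memory::Free(gpu0, gpu_buf);
    }
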
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index 68f064eaee5..230cc1ab0bb 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) {
   platform::GPUPlace gpu0(0);
   platform::CPUPlace cpu;
 
+  platform::CUDADeviceContext ctx(gpu0);
+
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   framework::DDim src_stride({5, 1});
@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({2, 1});
 
-  platform::CUDADeviceContext ctx(gpu0);
   StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
                      gpu_dst);
 
@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) {
   platform::GPUPlace gpu0(0);
   platform::CPUPlace cpu;
 
+  platform::CUDADeviceContext ctx(gpu0);
   int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
 
   int dst[8];
   int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
   framework::DDim src_stride({2, 1});
   framework::DDim dst_dim({2, 2});
   framework::DDim dst_stride({4, 1});
-  platform::CUDADeviceContext ctx(gpu0);
 
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
   StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index 541eca5f39c..7037551d754 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                  "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
 }
 
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind) {
-  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
-  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
-  //       block on stream 0 to make sure the copy has completed
-  PADDLE_ENFORCE(
-      cudaStreamSynchronize(0),
-      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
-}
-
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(
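
Call sites that relied on the blocking behaviour of the removed GpuMemcpySync can recover it by pairing GpuMemcpyAsync with an explicit synchronization on the caller's stream. A sketch of that migration, not part of the patch (GpuMemcpyBlocking is a hypothetical helper; all arguments are assumed to be supplied by the caller):

    #include "paddle/platform/enforce.h"
    #include "paddle/platform/gpu_info.h"

    // Hypothetical replacement for the deleted GpuMemcpySync. Unlike the old
    // implementation, it waits on the caller's stream instead of stream 0.
    void GpuMemcpyBlocking(void *dst, const void *src, size_t count,
                           enum cudaMemcpyKind kind, cudaStream_t stream) {
      paddle::platform::GpuMemcpyAsync(dst, src, count, kind, stream);
      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
                     "cudaStreamSynchronize failed in GpuMemcpyBlocking");
    }
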
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index db961f3838a..d05131fa419 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
 
-//! Copy memory from address src to dst synchronously.
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind);
-
 //! Copy memory from one device to another device.
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream);
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
index d36eac8379e..464096111e4 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) {
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) {
   GPUPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
   Transform<paddle::platform::CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
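
The pybind glue changed below follows the same recipe: instead of issuing a private synchronous copy, it borrows the context registered for a place from the new DeviceContextPool and enqueues the transfer on that context's stream. In isolation the pattern looks roughly like this (the wrapper function, its name, and num_bytes are hypothetical; the pool and copy calls are the ones this patch introduces):

    #include "paddle/framework/executor.h"
    #include "paddle/platform/device_context.h"
    #include "paddle/platform/gpu_info.h"

    // Sketch: device-to-host copy on the stream owned by the pooled context.
    void CopyToHostOnPooledStream(const paddle::platform::Place &place,
                                  void *dst, const void *src,
                                  size_t num_bytes) {
      paddle::framework::DeviceContextPool &pool =
          paddle::framework::DeviceContextPool::Get();
      auto dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
          pool.Borrow(place));
      paddle::platform::GpuMemcpyAsync(dst, src, num_bytes,
                                       cudaMemcpyDeviceToHost,
                                       dev_ctx->stream());
      dev_ctx->Wait();  // assume the host buffer is read immediately afterwards
    }
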
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 41fa658502d..268a0f2fa38 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -14,6 +14,7 @@
 #pragma once
 #include <string>
+#include "paddle/framework/executor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
 #include "pybind11/numpy.h"
@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
-        // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
-        // a Python numpy array. It's better to manage CDUA stream unifiedly.
-        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
-                                        sizeof(CUR_TYPE) * tensor.numel(),
-                                        cudaMemcpyDeviceToHost);
+
+        framework::DeviceContextPool &pool =
+            framework::DeviceContextPool::Get();
+        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
+            pool.Borrow(tensor.place()));
+
+        paddle::platform::GpuMemcpyAsync(
+            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
+            cudaMemcpyDeviceToHost, dev_ctx->stream());
 #else
         PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
 #endif
@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
-  // array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+
+  framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
 
 #endif
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index dee2febb83d..ec71d391e61 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -341,6 +341,10 @@ class TestBatchNormOp(OpTest):
         places = [core.CPUPlace()]
         if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
             places.append(core.GPUPlace(0))
+
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
                 test_with_place(place, data_format, [2, 3, 4, 5])
diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 627ab4e2356..a9d943b8b7f 100644
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -1,32 +1,47 @@
 import unittest
+import numpy
+
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-import numpy
+from paddle.v2.fluid.executor import Executor
 
 
 class TestGaussianRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.inputs = {}
+        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+
+        self.outputs = ["Out"]
+
     def test_cpu(self):
-        self.gaussian_random_test(place=core.CPUPlace())
+        self.gaussian_random_test(place=fluid.CPUPlace())
 
     def test_gpu(self):
         if core.is_compile_gpu():
-            self.gaussian_random_test(place=core.GPUPlace(0))
+            self.gaussian_random_test(place=fluid.GPUPlace(0))
 
     def gaussian_random_test(self, place):
-        scope = core.Scope()
-        scope.var('Out').get_tensor()
-
-        op = Operator(
-            "gaussian_random",
-            Out='Out',
-            shape=[1000, 784],
-            mean=.0,
-            std=1.,
-            seed=10)
 
         context = core.DeviceContext.create(place)
-        op.run(scope, context)
-        tensor = numpy.array(scope.find_var('Out').get_tensor())
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
+
         self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
         self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
"min": -5.0, + "max": 10.0, + "seed": 10 + } + self.outputs = ["Out"] + + def test_cpu(self): self.uniform_random_test(place=core.CPUPlace()) - def test_uniform_random_gpu(self): + def test_gpu(self): if core.is_compile_gpu(): self.uniform_random_test(place=core.GPUPlace(0)) def uniform_random_test(self, place): - scope = core.Scope() - scope.var('X').get_tensor() - - op = Operator( - "uniform_random", - Out='X', - shape=[1000, 784], - min=-5.0, - max=10.0, - seed=10) - - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - tensor = numpy.array(scope.find_var('X').get_tensor()) + context = core.DeviceContext.create(place) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = fluid.Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + + tensor = outs[0] + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) -- GitLab