From 9c7f6af5863e5848467e5953f6a773580903feb1 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 19 Jul 2021 17:44:21 +0800 Subject: [PATCH] Add Cuda event and stream API (#32460) * add cuda event and stream api * add cuda event and stream api * add get_current_stream api * add get_current_stream api * init streams * modify get_current_stream * modify get_cuttent_stream * add synchronize func * add current_stream doc and test file * move get_current_stream into CUDA macro * move CudaEvent into CUDA macro * move _get_current_stream and _device_synchronize into cuda macro * modify the macro of cuda stream and event * add test case for synchronize * add paddle.devices.cuda module * event and stream support hip * add doc for stream and event class * move cuda stream and event into single pybind * add cuda_streams_py.cc to cmakelist * add _device_synchronize and _get_current_stream to core module * add test case for cudastream and cudaevent * move __all__ in streams.py * fix test fail * add cuda to devices __all__ * fix current_stream doc writing error * move devices to device direction, and merge device.py into __init__.py * add required:gpu to sample codes * remove cuda direction from device/__init__.py --- paddle/fluid/platform/event.h | 94 +++++ paddle/fluid/platform/stream/cuda_stream.cc | 23 ++ paddle/fluid/platform/stream/cuda_stream.h | 38 +- paddle/fluid/pybind/CMakeLists.txt | 3 +- paddle/fluid/pybind/cuda_streams_py.cc | 351 ++++++++++++++++++ paddle/fluid/pybind/cuda_streams_py.h | 28 ++ paddle/fluid/pybind/pybind.cc | 4 + python/paddle/__init__.py | 1 + .../paddle/{device.py => device/__init__.py} | 10 +- python/paddle/device/cuda/__init__.py | 96 +++++ python/paddle/device/cuda/streams.py | 16 + python/paddle/fluid/core.py | 4 + .../tests/unittests/test_cuda_stream_event.py | 108 ++++++ python/setup.py.in | 2 + 14 files changed, 769 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/pybind/cuda_streams_py.cc create mode 100644 paddle/fluid/pybind/cuda_streams_py.h rename python/paddle/{device.py => device/__init__.py} (98%) create mode 100644 python/paddle/device/cuda/__init__.py create mode 100644 python/paddle/device/cuda/streams.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_stream_event.py diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 3a81cfab865..93fc56ab203 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -22,6 +22,7 @@ limitations under the License. */ #include #endif #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" namespace paddle { namespace platform { @@ -117,5 +118,98 @@ class MemEvent { std::string annotation_; }; +class CudaEvent { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + public: + CudaEvent() { +#ifdef PADDLE_WITH_HIP + hipEventCreateWithFlags(&event_, flags_); +#else + cudaEventCreateWithFlags(&event_, flags_); +#endif + } + + CudaEvent(unsigned int flags) : flags_(flags) { +#ifdef PADDLE_WITH_HIP + hipEventCreateWithFlags(&event_, flags_); +#else + cudaEventCreateWithFlags(&event_, flags_); +#endif + } + + void Record(paddle::platform::stream::CUDAStream& stream) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); +#endif + } + + bool Query() { +#ifdef PADDLE_WITH_HIP + gpuError_t err = hipEventQuery(event_); + if (err == hipSuccess) { + return true; + } + if (err == hipErrorNotReady) { + return false; + } +#else + gpuError_t err = cudaEventQuery(event_); + if (err == cudaSuccess) { + return true; + } + if (err == cudaErrorNotReady) { + return false; + } +#endif + PADDLE_ENFORCE_CUDA_SUCCESS(err); + return false; + } + + void Synchronize() { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventSynchronize(event_)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event_)); +#endif + } + gpuEvent_t GetRawCudaEvent() { return event_; } + + private: +#ifdef PADDLE_WITH_HIP + unsigned int flags_ = hipEventDefault; +#else + unsigned int flags_ = cudaEventDefault; +#endif + gpuEvent_t event_; +#endif +}; + +static unsigned int get_cuda_flags(bool enable_timing, bool blocking, + bool interprocess) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP + unsigned int flags = + (blocking ? hipEventBlockingSync : hipEventDefault) | + (enable_timing ? hipEventDefault : hipEventDisableTiming) | + (interprocess ? hipEventInterprocess : hipEventDefault); + return flags; +#else + unsigned int flags = + (blocking ? cudaEventBlockingSync : cudaEventDefault) | + (enable_timing ? cudaEventDefault : cudaEventDisableTiming) | + (interprocess ? cudaEventInterprocess : cudaEventDefault); + return flags; +#endif + +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot get the cuda event flags.")); + return 0; +#endif +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 6c6a47fadb5..2a3d05ed4e3 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -95,6 +96,28 @@ void CUDAStream::Wait() const { PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); } +CUDAStream* get_current_stream(int deviceId) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (deviceId == -1) { + deviceId = platform::GetCurrentDeviceId(); + } + + auto& pool = platform::DeviceContextPool::Instance(); + + platform::Place device = CUDAPlace(deviceId); + + auto stream = static_cast(pool.Get(device)) + ->context() + ->Stream() + .get(); + return stream; +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda current stream.")); + return nullptr; +#endif +} + } // namespace stream } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 46bbe94b080..ad54d842d40 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -33,8 +33,9 @@ enum class Priority : uint8_t { kHigh = 0x1, kNormal = 0x2, }; - +#endif class CUDAStream final { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) public: CUDAStream() = default; explicit CUDAStream(const Place& place, @@ -93,6 +94,37 @@ class CUDAStream final { #endif void Destroy(); + bool Query() const { +#ifdef PADDLE_WITH_HIP + hipError_t err = hipStreamQuery(stream_); + if (err == hipSuccess) { + return true; + } + if (err == hipErrorNotReady) { + return false; + } +#else + cudaError_t err = cudaStreamQuery(stream_); + if (err == cudaSuccess) { + return true; + } + if (err == cudaErrorNotReady) { + return false; + } +#endif + + PADDLE_ENFORCE_CUDA_SUCCESS(err); + return false; + } + + void Synchronize() const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); +#endif + } + private: Place place_; #ifdef PADDLE_WITH_HIP @@ -102,11 +134,11 @@ class CUDAStream final { #endif Priority priority_{Priority::kNormal}; std::unique_ptr> callback_manager_; - +#endif DISABLE_COPY_AND_ASSIGN(CUDAStream); }; -#endif +CUDAStream* get_current_stream(int deviceId); } // namespace stream } // namespace platform diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f1435f1b916..58471dd04ac 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -57,7 +57,8 @@ set(PYBIND_SRCS inference_api.cc compatible.cc io.cc - generator_py.cc) + generator_py.cc + cuda_streams_py.cc) if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc new file mode 100644 index 00000000000..df63239cad6 --- /dev/null +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +void BindCudaStream(py::module *m_ptr) { + auto &m = *m_ptr; + + // Bind Methods + m.def("_get_current_stream", + [](int deviceId) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return paddle::platform::stream::get_current_stream(deviceId); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit cuda current " + "stream.")); +#endif + }, + py::return_value_policy::reference); + + m.def("_device_synchronize", [](int device_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_id == -1) { + device_id = paddle::platform::GetCurrentDeviceId(); + } + + int curr_device_id = paddle::platform::GetCurrentDeviceId(); + paddle::platform::SetDeviceId(device_id); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#endif + paddle::platform::SetDeviceId(curr_device_id); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); +#endif + }); + + py::class_(m, "CUDAStream", R"DOC( + The handle of the CUDA stream. + + Parameters: + device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. + + priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). + If prioriyt is None, the priority is 2(normal). Default: None. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + s2 = paddle.device.cuda.Stream(0, 1) + s3 = paddle.device.cuda.Stream() + + )DOC") +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("wait_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent &event) { + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( + Makes all future work submitted to stream wait for all work captured in event. + + Parameters: + event(CUDAEvent): The event to wait on. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + event = paddle.device.cuda.Event() + s.wait_event(event) + + )DOC") + .def("wait_stream", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::stream::CUDAStream &stream) { + auto event = paddle::platform::CudaEvent(); + event.Record(stream); + + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( + Synchronizes with the given stream. + + Parameters: + stream(CUDAStream): The stream to synchronize with. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + s2 = paddle.device.cuda.Stream(0, 1) + s1.wait_stream(s2) + + )DOC") + .def("query", + [](paddle::platform::stream::CUDAStream &self) { + return self.Query(); + }, + R"DOC( + Return the status whether if all operations in stream have completed. + + Returns: A boolean value. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + is_done = s.query() + + )DOC") + .def("synchronize", + [](paddle::platform::stream::CUDAStream &self) { + self.Synchronize(); + }, + R"DOC( + Waits for stream tasks to complete. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + s.synchronize() + + )DOC") + .def("record_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent *event) { + if (event == nullptr) { + auto event_tmp = paddle::platform::CudaEvent(); + event = &event_tmp; + } + event->Record(self); + return event; + }, + R"DOC( + Record a CUDA event in the stream. + + Parameters: + event(CUDAEvent, optional): The event to be record. If event is None, a new event is created. + Default: None. + + Returns: + The recored event. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + event = s.record_event() + + )DOC", + py::arg("event") = nullptr) +#endif + .def("__init__", + [](paddle::platform::stream::CUDAStream &self, + platform::CUDAPlace *device, int priority) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (priority != 1 && priority != 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto prio = paddle::platform::stream::Priority(priority); + + if (device == nullptr) { + int curr_device_id = platform::GetCurrentDeviceId(); + auto device_tmp = platform::CUDAPlace(curr_device_id); + device = &device_tmp; + } + + new (&self) paddle::platform::stream::CUDAStream(*device, prio); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAStream can only be initialized on the GPU platform.")); +#endif + }, + py::arg("device") = nullptr, py::arg("priority") = 2) + .def( + "__init__", + [](paddle::platform::stream::CUDAStream &self, int device, + int priority) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (priority != 1 && priority != 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto prio = paddle::platform::stream::Priority(priority); + + int device_count = platform::GetCUDADeviceCount(); + if (device < 0) { + device = platform::GetCurrentDeviceId(); + } + if (device >= device_count) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The device id must be inside [0, %d), but input device=%d.", + device_count, device)); + } + + new (&self) paddle::platform::stream::CUDAStream( + platform::CUDAPlace(device), prio); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAStream can only be initialized on the GPU platform.")); +#endif + }, + py::arg("device") = -1, py::arg("priority") = 2) + .def("__init__", [](paddle::platform::stream::CUDAStream &self) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto prio = paddle::platform::stream::Priority::kNormal; + + int device_id = platform::GetCurrentDeviceId(); + + new (&self) paddle::platform::stream::CUDAStream( + platform::CUDAPlace(device_id), prio); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAStream can only be initialized on the GPU platform.")); +#endif + }); + + py::class_(m, "CUDAEvent", R"DOC( + The handle of the CUDA event. + + Parameters: + enable_timing(bool, optional): Whether the event will measure time. Default: False. + blocking(bool, optional): Whether the wait() func will be blocking. Default: False; + interprocess(bool, optional): Whether the event can be shared between processes. Defalut: False. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + event = paddle.device.cuda.Event() + + )DOC") +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("record", + [](paddle::platform::CudaEvent &self, + paddle::platform::stream::CUDAStream *stream) { + if (stream == nullptr) { + stream = paddle::platform::stream::get_current_stream(-1); + } + self.Record(*stream); + }, + R"DOC( + Records the event in the given stream. + + Parameters: + stream(CUDAStream, optional): The handle of CUDA stream. If None, the stream is the current stream. Default: None. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + event = paddle.device.cuda.Event() + event.record() + + )DOC", + py::arg("stream") = nullptr) + .def("query", + [](paddle::platform::CudaEvent &self) { return self.Query(); }, + R"DOC( + Queries the event's status. + + Returns: A boolean which indicates all work currently captured by the event has been completed. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + event = paddle.device.cuda.Event() + is_done = event.query() + + )DOC") + .def("synchronize", + [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC( + Waits for an event to complete. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + event = paddle.device.cuda.Event() + event.synchronize() + + )DOC") +#endif + .def("__init__", + [](paddle::platform::CudaEvent &self, bool enable_timing, + bool blocking, bool interprocess) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + unsigned int flags = platform::get_cuda_flags( + enable_timing, blocking, interprocess); + new (&self) paddle::platform::CudaEvent(flags); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAEvent can only be initialized on the GPU " + "platform.")); + +#endif + }, + py::arg("enable_timing") = false, py::arg("blocking") = false, + py::arg("interprocess") = false); +} + +} // namespace pybind + +} // namespace paddle diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h new file mode 100644 index 00000000000..45c388af85b --- /dev/null +++ b/paddle/fluid/pybind/cuda_streams_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindCudaStream(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e701d3cc16d..4286b9092c2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -70,6 +70,7 @@ limitations under the License. */ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/io.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" @@ -77,6 +78,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/box_helper_py.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/fleet_wrapper_py.h" @@ -471,6 +473,8 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindCudaStream(&m); + // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 7490a0c29c2..eb998526eb0 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -52,6 +52,7 @@ import paddle.metric # noqa: F401 import paddle.regularizer # noqa: F401 import paddle.incubate # noqa: F401 import paddle.autograd # noqa: F401 +import paddle.device # noqa: F401 import paddle.jit # noqa: F401 import paddle.amp # noqa: F401 diff --git a/python/paddle/device.py b/python/paddle/device/__init__.py similarity index 98% rename from python/paddle/device.py rename to python/paddle/device/__init__.py index b7f69720a6e..0c53097d9ff 100644 --- a/python/paddle/device.py +++ b/python/paddle/device/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,7 @@ from paddle.fluid import framework from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.framework import is_compiled_with_cuda # noqa: F401 from paddle.fluid.framework import is_compiled_with_rocm # noqa: F401 - +from . import cuda __all__ = [ # noqa 'get_cudnn_version', diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py new file mode 100644 index 00000000000..8b5879fe9a4 --- /dev/null +++ b/python/paddle/device/cuda/__init__.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core + +from .streams import Stream # noqa: F401 +from .streams import Event # noqa: F401 + +__all__ = [ + 'Stream', + 'Event', + 'current_stream', + 'synchronize', +] + + +def current_stream(device=None): + ''' + Return the current CUDA stream by the device. + + Parameters: + device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. + If device is None, the device is the current device. Default: None. + + Returns: + CUDAStream: the stream to the device. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + + s1 = paddle.device.cuda.current_stream() + + s2 = paddle.device.cuda.current_stream(0) + + s3 = paddle.device.cuda.current_stream(paddle.CUDAPlace(0)) + + ''' + + device_id = -1 + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + else: + raise ValueError("device type must be int or paddle.CUDAPlace") + + return core._get_current_stream(device_id) + + +def synchronize(device=None): + ''' + Wait for the compute on the given CUDA device to finish. + + Parameters: + device(paddle.CUDAPlace()|int, optional): The device or the ID of the device. + If device is None, the device is the current device. Default: None. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + + paddle.device.cuda.synchronize() + paddle.device.cuda.synchronize(0) + paddle.device.cuda.synchronize(paddle.CUDAPlace(0)) + + ''' + + device_id = -1 + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + else: + raise ValueError("device type must be int or paddle.CUDAPlace") + + return core._device_synchronize(device_id) diff --git a/python/paddle/device/cuda/streams.py b/python/paddle/device/cuda/streams.py new file mode 100644 index 00000000000..4efe5005034 --- /dev/null +++ b/python/paddle/device/cuda/streams.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.core import CUDAStream as Stream +from paddle.fluid.core import CUDAEvent as Event diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index ae1a944f7a2..3087a5a4b01 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -274,6 +274,8 @@ if avx_supported(): from .core_avx import _cuda_synchronize from .core_avx import _is_compiled_with_heterps from .core_avx import _promote_types_if_complex_exists + from .core_avx import _device_synchronize + from .core_avx import _get_current_stream if sys.platform != 'win32': from .core_avx import _set_process_pids from .core_avx import _erase_process_pids @@ -323,6 +325,8 @@ if load_noavx: from .core_noavx import _cuda_synchronize from .core_noavx import _is_compiled_with_heterps from .core_noavx import _promote_types_if_complex_exists + from .core_noavx import _device_synchronize + from .core_noavx import _get_current_stream if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py new file mode 100644 index 00000000000..04e42c4e5f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.device import cuda +import paddle + +import unittest + + +class TestCurrentStream(unittest.TestCase): + def test_current_stream(self): + if paddle.is_compiled_with_cuda(): + s = cuda.current_stream() + self.assertTrue(isinstance(s, cuda.Stream)) + + s1 = cuda.current_stream(0) + self.assertTrue(isinstance(s1, cuda.Stream)) + + s2 = cuda.current_stream(paddle.CUDAPlace(0)) + self.assertTrue(isinstance(s2, cuda.Stream)) + + self.assertEqual(s1, s2) + + self.assertRaises(ValueError, cuda.current_stream, "gpu:0") + + +class TestSynchronize(unittest.TestCase): + def test_synchronize(self): + if paddle.is_compiled_with_cuda(): + self.assertIsNone(cuda.synchronize()) + self.assertIsNone(cuda.synchronize(0)) + self.assertIsNone(cuda.synchronize(paddle.CUDAPlace(0))) + + self.assertRaises(ValueError, cuda.synchronize, "gpu:0") + + +class TestCUDAStream(unittest.TestCase): + def test_cuda_stream(self): + if paddle.is_compiled_with_cuda(): + s = paddle.device.cuda.Stream() + self.assertIsNotNone(s) + + def test_cuda_stream_synchronize(self): + if paddle.is_compiled_with_cuda(): + s = paddle.device.cuda.Stream() + e1 = paddle.device.cuda.Event(True, False, False) + e2 = paddle.device.cuda.Event(True, False, False) + + e1.record(s) + e1.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.synchronize() + e2.record(s) + e2.synchronize() + + self.assertTrue(s.query()) + + def test_cuda_stream_wait_event_and_record_event(self): + if paddle.is_compiled_with_cuda(): + s1 = cuda.Stream(0) + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + e1 = cuda.Event(False, False, False) + s1.record_event(e1) + + s2 = cuda.Stream(0) + s2.wait_event(e1) + s2.synchronize() + + self.assertTrue(e1.query() and s1.query() and s2.query()) + + +class TestCUDAEvent(unittest.TestCase): + def test_cuda_event(self): + if paddle.is_compiled_with_cuda(): + e = paddle.device.cuda.Event(True, False, False) + self.assertIsNotNone(e) + s = paddle.device.cuda.current_stream() + + def test_cuda_event_methods(self): + if paddle.is_compiled_with_cuda(): + e = paddle.device.cuda.Event(True, False, False) + s = paddle.device.cuda.current_stream() + event_query_1 = e.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.record_event(e) + e.synchronize() + event_query_2 = e.query() + + self.assertTrue(event_query_1) + self.assertTrue(event_query_2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index ba7ea88dd43..0db6c0c27d7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -224,6 +224,8 @@ packages=['paddle', 'paddle.tensor', 'paddle.onnx', 'paddle.autograd', + 'paddle.device', + 'paddle.device.cuda', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: -- GitLab