From 95fe318e3ee19004419eb5aff09bca7ddaacad46 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 15 Aug 2017 14:08:20 +0800 Subject: [PATCH] init --- Dockerfile | 14 ------ cmake/flags.cmake | 7 --- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/device_context.cc | 79 +++++++++++++++++++++++++------ paddle/platform/device_context.h | 12 +++-- 5 files changed, 74 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index da00471025..98f61ba586 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,20 +71,6 @@ RUN pip install -r /root/requirements.txt RUN apt-get install -y libssl-dev libffi-dev RUN pip install certifi urllib3[secure] -# TODO(qijun) The template library Eigen doesn't work well with GCC 5 -# coming with the default Docker image, so we switch to use GCC 4.8 -# by default. And I will check Eigen library later. - -RUN ln -sf gcc-4.8 /usr/bin/gcc && \ - ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \ - ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \ - ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \ - ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \ - ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \ - ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \ - ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \ - ln -sf g++-4.8 /usr/bin/g++ && \ - ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ # Install woboq_codebrowser to /woboq RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b27eb71550..47bb83b00a 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") endif() - if(NOT ANDROID) - # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. - # Use Debug mode instead for now. - if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) - set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) - endif() - endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # Apple Clang is a different compiler than upstream Clang which havs different version numbers. diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4154aad15c..c1ad60d160 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -16,5 +16,5 @@ ELSE() set(GPU_CTX_DEPS) ENDIF() -cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) +cc_library(device_context SRCS device_context.cc DEPS memory place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index a928e09778..dc345bdd57 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/device_context.h" +#include "paddle/memory/memory.h" namespace paddle { namespace platform { @@ -36,6 +37,59 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifndef PADDLE_ONLY_CPU +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + GPUPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable char* scratch_; + mutable unsigned int* semaphore_; +}; + template <> Eigen::GpuDevice* DeviceContext::get_eigen_device() const { return reinterpret_cast(this)->eigen_device(); @@ -43,19 +97,9 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device() const { CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { SetDeviceId(place_.device); - // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly - // here will cause segment fault. We must implement a class derived from - // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id - // later. Please refer to the implementation of class EigenCudaStreamDevice - // in TensorFlow. - // - // We find that CUDA 7 introduces a new option, the per-thread default stream, - // that has two effects. Please refer to https://devblogs.nvidia.com/ - // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ - // - // So, we decide to use default stream and add –default-stream per-thread nvcc - // flag. Than, two threads with two CUDADeviceContexts will run parallelly. - eigen_stream_.reset(new Eigen::CudaStreamDevice()); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); } @@ -75,12 +119,13 @@ CUDADeviceContext::~CUDADeviceContext() { } eigen_stream_.reset(); eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - PADDLE_ENFORCE(cudaStreamSynchronize(0)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { @@ -91,6 +136,7 @@ cublasHandle_t CUDADeviceContext::cublas_handle() { if (!cublas_handle_) { SetDeviceId(place_.device); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); } return cublas_handle_; } @@ -99,10 +145,13 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() { if (!cudnn_handle_) { SetDeviceId(place_.device); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnnHandle_t, stream_)); } return cudnn_handle_; } +cudaStream_t CUDADeviceContext::stream() { return stream_; } + curandGenerator_t CUDADeviceContext::curand_generator() { if (!curand_generator_) { SetDeviceId(place_.device); @@ -110,6 +159,8 @@ curandGenerator_t CUDADeviceContext::curand_generator() { CURAND_RNG_PSEUDO_DEFAULT)); PADDLE_ENFORCE( dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_)); + + PADDLE_ENFORCE(dynload::curandSetStream(curandGenerator_t, stream_)); } return curand_generator_; } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 08b5b2cff9..b68e177c0a 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -76,6 +76,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return curand handle in the device context. */ curandGenerator_t curand_generator(); + + /*! \brief Return cuda stream in the device context. */ + cudaStream_t stream(); // clang-format on private: @@ -83,15 +86,16 @@ class CUDADeviceContext : public DeviceContext { private: std::unique_ptr eigen_device_; - std::unique_ptr eigen_stream_; + std::unique_ptr eigen_stream_; private: uint64_t seed_; // clang-format off - cudnnHandle_t cudnn_handle_ = nullptr; - cublasHandle_t cublas_handle_ = nullptr; - curandGenerator_t curand_generator_ = nullptr; + cudaStream_t stream_{nullptr} + cudnnHandle_t cudnn_handle_{nullptr}; + cublasHandle_t cublas_handle_{nullptr}; + curandGenerator_t curand_generator_{nullptr}; // clang-format on }; -- GitLab