diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d38118d2a260a21745565f02a31acda5af334772..98c2e92f2c38e9daf4f29873a983f4acd35ef60a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -442,95 +442,6 @@ const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -class EigenCudaStreamDevice : public Eigen::StreamInterface { - public: - EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { - Eigen::initializeDeviceProp(); - } - ~EigenCudaStreamDevice() override {} - - void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) { - stream_ = cuda_stream; - place_ = place; - device_prop_ = &Eigen::m_deviceProperties[place.device]; - } - - const gpuStream_t& stream() const override { return *stream_; } - -#ifdef PADDLE_WITH_HIP - const hipDeviceProp_t& deviceProperties() const override { -#else - const cudaDeviceProp& deviceProperties() const override { -#endif - return *device_prop_; - } - - void* allocate(size_t num_bytes) const override { - if (UNLIKELY(num_bytes == 0)) { - return nullptr; - } - auto buf = memory::Alloc(place_, num_bytes); - VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size" << buf->size() - << " requested " << num_bytes; - void* retv = buf->ptr(); - { - std::lock_guard lock(mtx_); - allocations_.emplace(retv, std::move(buf)); - } - return retv; - } - - void deallocate(void* buffer) const override { - if (LIKELY(buffer)) { - std::lock_guard lock(mtx_); - allocations_.erase(buffer); - } - } - - void* scratchpad() const override { - if (scratch_ == NULL) { - scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - unsigned int* semaphore() const override { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; - semaphore_ = reinterpret_cast(scratch); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); -#endif - } - return semaphore_; - } - - private: - CUDAPlace place_; - const gpuStream_t* stream_; // not owned; -#ifdef PADDLE_WITH_HIP - const hipDeviceProp_t* device_prop_; -#else - const cudaDeviceProp* device_prop_; // not owned; -#endif - mutable void* scratch_; - mutable unsigned int* semaphore_; - mutable std::mutex mtx_; // to protect allocations_ - mutable std::unordered_map allocations_; -}; - -void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { - if (required_workspace_bytes <= WorkspaceSize()) { - return; - } - // reset allocation first before re-allocate to save memory - allocation_.reset(); - allocation_ = memory::Alloc(device_context_, required_workspace_bytes); -} CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6d08a0cc32b632aea7dac48a08b8093b4a1507ba..4bb1e3abf8a5850b846f902bd053236b16b23f78 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -268,58 +268,6 @@ struct DefaultDeviceContextType { #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -class CudnnWorkspaceHandle; -class EigenCudaStreamDevice; - -class CudnnWorkspaceHandle { - public: - inline CudnnWorkspaceHandle(const phi::GPUContext& dev_ctx, std::mutex* mtx) - : device_context_(dev_ctx), mtx_(mtx) {} - - template - inline void RunFunc(Callback&& cudnn_func, size_t required_workspace_bytes) { - if (required_workspace_bytes > WorkspaceSize()) { - ReallocWorkspace(required_workspace_bytes); - } - VLOG(2) << "Cudnn workspace size at RunFunc: " - << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; - { - std::lock_guard guard(*mtx_); - cudnn_func(allocation_ ? allocation_->ptr() : nullptr); - } - } - - /*! \brief Thread which call RunFuncSync() would release gpu memory after - * running the function. Currently this function is only used when cudnn - * exhaustive searching and callers have to guarantee that the input function - * is host blocking */ - template - inline void RunFuncSync(Callback&& cudnn_func, - size_t required_workspace_bytes) { - RunFunc(cudnn_func, required_workspace_bytes); - ResetWorkspace(); - } - - void ReallocWorkspace(size_t required_workspace_bytes); - - inline void ResetWorkspace() { allocation_ = nullptr; } - - inline size_t WorkspaceSize() { - if (allocation_ == nullptr) { - return 0; - } - return allocation_->size(); - } - - CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default; - CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete; - - private: - memory::allocation::AllocationPtr allocation_; - const phi::GPUContext& device_context_; - std::mutex* mtx_; -}; - template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext;