Unverified commit d5f0ed4b, authored by Wilber, committed by GitHub

update (#44418)

Parent 130c108a
@@ -302,11 +302,8 @@ void AnalysisPredictor::InitPlace() {
place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (config_.thread_local_stream_enabled()) {
auto *ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place_));
VLOG(3) << "The prediction process will be completed using a separate "
"normal-priority stream on each thread.";
ctx->ResetThreadContext(platform::stream::Priority::kNormal);
LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. "
"Please use config.SetExecStream instead.";
}
#endif
} else if (config_.use_xpu()) {
@@ -1621,14 +1618,8 @@ bool AnalysisPredictor::ZeroCopyRun() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
if (stream != nullptr) {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = place_;
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
dev_ctx->SetThreadLocalStream(stream);
}
LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. "
"Please use config.SetExecStream instead.";
return ZeroCopyRun();
}
#endif
......
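Both deprecation warnings above point callers at config.SetExecStream. The following is a minimal migration sketch, not Paddle documentation: it assumes the paddle_infer::Config::SetExecStream overload that accepts the raw stream pointer and the usual header layout of the Paddle Inference C++ API, so verify both against your installed version.

```cpp
#include <cuda_runtime.h>

#include "paddle_inference_api.h"  // assumed include name for the Paddle Inference C++ API

int main() {
  // User-owned stream; it must outlive the predictor that runs on it.
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  paddle_infer::Config config;
  config.SetModel("./model_dir");          // hypothetical model directory
  config.EnableUseGpu(256 /*MB*/, 0 /*device id*/);
  config.SetExecStream(stream);            // assumed replacement for the removed
                                           // thread-local stream interfaces

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... bind inputs via predictor->GetInputHandle(name), then run:
  predictor->Run();

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
```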
@@ -534,74 +534,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
thread_local std::unordered_map<const CUDADeviceContext*,
std::shared_ptr<CUDAContext>>
CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;
void CUDAContext::InitEigenContext() {
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&RawStream(), place_);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}
CUDAContext::CUDAContext(const CUDAPlace& place,
const stream::Priority& priority,
const stream::StreamFlag& flag) {
place_ = place;
CUDADeviceGuard guard(place_.device);
stream_.reset(new stream::CUDAStream(place, priority, flag));
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
InitCuBlasLtContext();
#endif
InitCuSparseContext();
InitCuSolverContext();
#endif
}
void CUDAContext::SetStream(gpuStream_t stream) {
if (stream_->raw_stream() != stream) {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
DestoryCuBlasLtContext();
#endif
DestoryCuSolverContext();
#endif
stream_->SetStream(stream);
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
InitCuBlasLtContext();
#endif
InitCuSolverContext();
#endif
}
}
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
DestoryCuBlasLtContext();
#endif
DestoryCuSparseContext();
DestoryCuSolverContext();
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
phi::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
@@ -609,123 +541,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
CUDADeviceContext::~CUDADeviceContext() = default;
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
if (thread_ctx_.count(this)) {
return context()->EigenDevice().get();
}
return phi::GPUContext::eigen_device();
}
void CUDADeviceContext::Wait() const {
VLOG(4) << "CUDA context(" << this << ") Wait";
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
}
phi::GPUContext::Wait();
}
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
if (thread_ctx_.count(this)) {
return context()->CudnnHandle();
}
return phi::GPUContext::cudnn_handle();
}
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return phi::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
if (thread_ctx_.count(this)) {
return context()->CublasHandle()->GetCublasHandle();
}
return phi::GPUContext::cublas_handle();
}
#if CUDA_VERSION >= 11060
cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const {
if (thread_ctx_.count(this)) {
return context()->CublasLtHandle()->GetCublasLtHandle();
}
return phi::GPUContext::cublaslt_handle();
}
#endif
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusparseHandle()->GetCusparseHandle();
}
return phi::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
if (thread_ctx_.count(this)) {
return context()->CusolverDnHandle();
}
return phi::GPUContext::cusolver_dn_handle();
}
#endif
void CUDADeviceContext::RecordEvent(
gpuEvent_t ev, const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->RecordEvent(ev, callback);
return;
}
phi::GPUContext::RecordEvent(ev, callback);
}
void CUDADeviceContext::AddStreamCallback(
const std::function<void()>& callback) const {
if (thread_ctx_.count(this)) {
context()->Stream()->AddCallback(callback);
return;
}
phi::GPUContext::AddStreamCallback(callback);
}
void CUDADeviceContext::WaitStreamCallback() const {
if (thread_ctx_.count(this)) {
context()->Stream()->WaitCallback();
return;
}
phi::GPUContext::WaitStreamCallback();
}
phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
if (thread_ctx_.count(this)) {
// return workspace_.get();
return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace())
.get(),
stream());
}
return phi::GPUContext::cudnn_workspace_handle();
}
gpuStream_t CUDADeviceContext::stream() const {
if (thread_ctx_.count(this)) {
return context()->RawStream();
}
return phi::GPUContext::stream();
}
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
if (!thread_ctx_.count(this)) {
PADDLE_THROW(platform::errors::PermissionDenied(
"CUDADeviceContext call context() failed, make sure in the "
"thread_local semantic."));
}
return thread_ctx_.at(this);
}
stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
return cuda_stream_.get();
}
......
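What this hunk deletes is a per-thread override mechanism: a thread_local map keyed by the CUDADeviceContext object's address, consulted by eigen_device(), cudnn_handle(), stream(), and the callback/event helpers before falling back to the shared phi::GPUContext state. Below is a self-contained sketch of that dispatch pattern; the type and member names are illustrative stand-ins, not Paddle APIs.

```cpp
#include <memory>
#include <unordered_map>

// Illustrative stand-in for the per-thread stream and library handles.
struct ThreadLocalResource {
  int stream_id;
};

class DeviceContext {
 public:
  // Mirrors ResetThreadContext(): install an override visible only to the
  // calling thread and only for this particular context object.
  void ResetThreadResource(int stream_id) {
    overrides_[this] = std::make_shared<ThreadLocalResource>(
        ThreadLocalResource{stream_id});
  }

  // Mirrors stream()/cudnn_handle()/eigen_device(): prefer the thread-local
  // override when present, otherwise use the shared default.
  int stream_id() const {
    auto it = overrides_.find(this);
    return it != overrides_.end() ? it->second->stream_id : default_stream_id_;
  }

 private:
  int default_stream_id_ = 0;

  // thread_local static: every thread sees its own map, so two threads can
  // drive the same DeviceContext object on different streams. (The original
  // also keeps a thread_local mutex; it is omitted here because the map is
  // already per-thread.)
  static thread_local std::unordered_map<const DeviceContext*,
                                         std::shared_ptr<ThreadLocalResource>>
      overrides_;
};

thread_local std::unordered_map<const DeviceContext*,
                                std::shared_ptr<ThreadLocalResource>>
    DeviceContext::overrides_;
```

The header being removed keeps a related warning: the thread_local static may be destroyed before the global context, so it must never be touched in the destructor. That lifetime subtlety is part of what a single stream set on the config avoids.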
@@ -274,366 +274,17 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
class CUDAContext {
public:
CUDAContext() = default;
explicit CUDAContext(
const CUDAPlace& place,
const stream::Priority& priority = stream::Priority::kNormal,
const stream::StreamFlag& flag = stream::StreamFlag::kDefaultFlag);
~CUDAContext();
const CUDAPlace& Place() const { return place_; }
const std::unique_ptr<Eigen::GpuDevice>& EigenDevice() const {
return eigen_device_;
}
const std::unique_ptr<EigenCudaStreamDevice>& EigenStream() const {
return eigen_stream_;
}
const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
stream::CUDAStream* SetStream(stream::CUDAStream* new_stream_ptr) {
auto* old_stream_ptr = stream_.release();
stream_.reset(new_stream_ptr);
return old_stream_ptr;
}
void SetStream(gpuStream_t stream);
const gpuStream_t& RawStream() { return stream_->raw_stream(); }
#ifdef PADDLE_WITH_HIP
const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
#else
const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
#endif
#ifndef PADDLE_WITH_HIP
const cusolverDnHandle_t& CusolverDnHandle() const {
return cusolver_dn_handle_;
}
#endif
const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
return cublas_handle_;
}
const std::unique_ptr<CublasHandleHolder>& CublasTensorCoreHandle() const {
return cublas_tensor_core_handle_;
}
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
const std::unique_ptr<CublasLtHandleHolder>& CublasLtHandle() const {
return cublaslt_handle_;
}
#endif
const std::unique_ptr<CusparseHandleHolder>& CusparseHandle() const {
return cusparse_handle_;
}
#endif
/*! \brief Call cublas function safely. */
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tf32_tensor_core_handle_) {
cublas_tf32_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(callback);
}
}
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
/*! \brief Call cublasLt function safely. */
inline void CublasLtCall(
const std::function<void(blasLtHandle_t)>& callback) const {
cublaslt_handle_->Call(callback);
}
#endif
/*! \brief Call cusparse function safely. */
inline void CusparseCall(
const std::function<void(phi::sparseHandle_t)>& callback) const {
cusparse_handle_->Call(callback);
}
#endif
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(callback);
} else {
cublas_handle_->Call(callback);
}
}
private:
void InitEigenContext();
#ifdef PADDLE_WITH_HIP
void InitCuBlasContext() {
cublas_handle_.reset(new CublasHandleHolder(RawStream()));
}
#else
void InitCuBlasContext() {
cublas_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));
if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
cublas_tf32_tensor_core_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_TF32_TENSOR_OP_MATH));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
}
}
#endif
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
void InitCuBlasLtContext() {
cublaslt_handle_.reset(new CublasLtHandleHolder());
}
#endif
void InitCuSparseContext() {
cusparse_handle_.reset(new CusparseHandleHolder(RawStream()));
}
#endif
void InitCuDNNContext() {
if (dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
auto compile_miopen_version = MIOPEN_VERSION / 10;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 100 << "."
<< compile_miopen_version % 100
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 100 << "." << local_miopen_version % 100
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::miopenSetStream(cudnn_handle_, RawStream()));
#else
auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, RawStream()));
#endif
} else {
cudnn_handle_ = nullptr;
}
}
#ifndef PADDLE_WITH_HIP
void InitCuSolverContext() {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
}
#endif
void DestoryCuDNNContext() {
if (cudnn_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
#endif
}
cudnn_handle_ = nullptr;
}
void DestoryCuBlasContext() {
cublas_handle_.reset();
cublas_tensor_core_handle_.reset();
cublas_tf32_tensor_core_handle_.reset();
}
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
void DestoryCuBlasLtContext() { cublaslt_handle_.reset(); }
#endif
void DestoryCuSparseContext() { cusparse_handle_.reset(); }
#endif
#ifndef PADDLE_WITH_HIP
void DestoryCuSolverContext() {
if (cusolver_dn_handle_) {
PADDLE_ENFORCE_GPU_SUCCESS(
dynload::cusolverDnDestroy(cusolver_dn_handle_));
}
}
#endif
CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<stream::CUDAStream> stream_;
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle_;
#else
cudnnHandle_t cudnn_handle_;
#endif
std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
std::unique_ptr<CublasLtHandleHolder> cublaslt_handle_;
#endif
cusolverDnHandle_t cusolver_dn_handle_;
std::unique_ptr<CusparseHandleHolder> cusparse_handle_;
#endif
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
class CUDADeviceContext : public phi::GPUContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext();
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Call cublas function safely. */
inline void CublasCall(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
phi::GPUContext::CublasCall(callback);
return;
}
return context()->CublasCall(callback);
}
#ifndef PADDLE_WITH_HIP
/*! \brief Call cusparse function safely. */
inline void CusparseCall(
const std::function<void(phi::sparseHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
phi::GPUContext::CusparseCall(callback);
return;
}
context()->CusparseCall(callback);
}
#endif
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
inline void TensorCoreCublasCallIfAvailable(
const std::function<void(blasHandle_t)>& callback) const {
if (!thread_ctx_.count(this)) {
phi::GPUContext::TensorCoreCublasCallIfAvailable(callback);
return;
}
context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle() const;
#else
cudnnHandle_t cudnn_handle() const;
#endif
/*! \brief Return cublas handle in the device context. */
#ifdef PADDLE_WITH_HIP
rocblas_handle cublas_handle() const;
#else
cublasHandle_t cublas_handle() const;
cublasLtHandle_t cublaslt_handle() const;
cusparseHandle_t cusparse_handle() const;
#endif
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
* would be acquired to prevent other threads from accessing the
* workspace. Once the handle is destructed, the lock would be released.
* CudnnWorkspaceHandle is an RAII object to implement thread-safe
* sequential cudnn function calls. */
phi::DnnWorkspaceHandle cudnn_workspace_handle() const;
/*! \brief Return cuda stream in the device context. */
gpuStream_t stream() const;
void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;
void AddStreamCallback(const std::function<void()>& callback) const;
void WaitStreamCallback() const;
void ResetThreadContext(const stream::Priority& priority) {
std::lock_guard<std::mutex> guard(ctx_mtx_);
thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
}
std::shared_ptr<CUDAContext> context() const;
// Note: Can only be used under thread_local semantics.
void SetThreadLocalStream(const gpuStream_t stream) {
thread_ctx_.at(this)->SetStream(stream);
}
// NOTE: Just for compatibility with the past, please delete if there is an
// elegant way.
stream::CUDAStream* GetCudaStream() const;
stream::CUDAStream* SetCudaStream(stream::CUDAStream*);
private:
// The thread_local static variable will be released before the
// global static variable, so avoid using it in dtor.
static thread_local std::unordered_map<const CUDADeviceContext*,
std::shared_ptr<CUDAContext>>
thread_ctx_;
static thread_local std::mutex ctx_mtx_;
mutable std::mutex cudnn_handle_mtx_;
// NOTE: Just for compatibility with the past, please delete if there is an
// elegant way.
std::unique_ptr<stream::CUDAStream> cuda_stream_;
......
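The cudnn_workspace_handle() docstring in the removed header describes an RAII handle that serializes cuDNN calls on a shared workspace: the first call through the handle acquires a lock, later calls reuse it, and destroying the handle releases it. A generic sketch of that idea, with illustrative names only, assuming a caller-provided buffer and mutex:

```cpp
#include <cstddef>
#include <functional>
#include <mutex>
#include <vector>

class WorkspaceHandle {
 public:
  WorkspaceHandle(std::vector<char>* buffer, std::mutex* mtx)
      : buffer_(buffer), mtx_(mtx) {}

  // First call acquires the lock; subsequent calls reuse it, so a sequence of
  // cudnn-style calls sees a consistent, exclusively-owned workspace.
  void RunFunc(const std::function<void(void*)>& func,
               std::size_t required_bytes) {
    if (!lock_.owns_lock()) {
      lock_ = std::unique_lock<std::mutex>(*mtx_);
    }
    if (buffer_->size() < required_bytes) {
      buffer_->resize(required_bytes);
    }
    func(buffer_->data());
  }

  // The lock (if taken) is released automatically when the handle is destroyed.

 private:
  std::vector<char>* buffer_;
  std::mutex* mtx_;
  std::unique_lock<std::mutex> lock_;
};
```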