Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
1280f294
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1280f294
编写于
5月 13, 2022
作者:
W
Wilber
提交者:
GitHub
5月 13, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add gpu resources. (#42723)
上级
757b5d31
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
876 addition
and
262 deletion
+876
-262
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+2
-2
paddle/fluid/inference/api/infer_context.cc
paddle/fluid/inference/api/infer_context.cc
+17
-0
paddle/fluid/inference/api/infer_context.h
paddle/fluid/inference/api/infer_context.h
+46
-0
paddle/fluid/inference/api/resource_manager.cc
paddle/fluid/inference/api/resource_manager.cc
+290
-0
paddle/fluid/inference/api/resource_manager.h
paddle/fluid/inference/api/resource_manager.h
+109
-0
paddle/phi/backends/gpu/CMakeLists.txt
paddle/phi/backends/gpu/CMakeLists.txt
+2
-1
paddle/phi/backends/gpu/gpu_context.cc
paddle/phi/backends/gpu/gpu_context.cc
+84
-259
paddle/phi/backends/gpu/gpu_context.h
paddle/phi/backends/gpu/gpu_context.h
+4
-0
paddle/phi/backends/gpu/gpu_resources.cc
paddle/phi/backends/gpu/gpu_resources.cc
+271
-0
paddle/phi/backends/gpu/gpu_resources.h
paddle/phi/backends/gpu/gpu_resources.h
+51
-0
未找到文件。
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
1280f294
...
...
@@ -50,10 +50,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
endif
()
if
(
WITH_ONNXRUNTIME
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc
${
mkldnn_quantizer_src
}
DEPS
${
inference_deps
}
cc_library
(
analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc
resource_manager.cc infer_context.cc
${
mkldnn_quantizer_src
}
DEPS
${
inference_deps
}
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx
)
else
(
WITH_ONNXRUNTIME
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc
${
mkldnn_quantizer_src
}
DEPS
${
inference_deps
}
cc_library
(
analysis_predictor SRCS analysis_predictor.cc
resource_manager.cc infer_context.cc
${
mkldnn_quantizer_src
}
DEPS
${
inference_deps
}
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils
)
endif
(
WITH_ONNXRUNTIME
)
...
...
paddle/fluid/inference/api/infer_context.cc
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/infer_context.h"
namespace
paddle
{}
// namespace paddle
paddle/fluid/inference/api/infer_context.h
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
namespace
paddle
{
class
InferCPUContext
:
public
phi
::
CPUContext
{
public:
using
phi
::
CPUContext
::
SetEigenDevice
;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class
InferGPUContext
:
public
phi
::
GPUContext
{
public:
using
phi
::
GPUContext
::
SetStream
;
using
phi
::
GPUContext
::
SetEigenDevice
;
using
phi
::
GPUContext
::
SetBlasHandle
;
using
phi
::
GPUContext
::
SetBlasTensorCoreHandle
;
using
phi
::
GPUContext
::
SetBlasTF32Handle
;
using
phi
::
GPUContext
::
SetDnnHandle
;
using
phi
::
GPUContext
::
SetSolverHandle
;
using
phi
::
GPUContext
::
SetSparseHandle
;
// using phi::GPUContext::SetDnnWorkspaceHandle;
using
phi
::
GPUContext
::
SetComputeCapability
;
using
phi
::
GPUContext
::
SetMaxThreadsPerMultiProcessor
;
using
phi
::
GPUContext
::
SetMultiProcessors
;
using
phi
::
GPUContext
::
SetMaxThreadsPerBlock
;
using
phi
::
GPUContext
::
SetMaxGridDimSize
;
using
phi
::
GPUContext
::
SetDriverVersion
;
using
phi
::
GPUContext
::
SetRuntimeVersion
;
};
#endif
}
// namespace paddle
paddle/fluid/inference/api/resource_manager.cc
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/resource_manager.h"
#include <unordered_map>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_resources.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/generator.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace
paddle
{
namespace
internal
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class
EigenGpuStreamDevice
:
public
Eigen
::
StreamInterface
{
public:
EigenGpuStreamDevice
()
:
scratch_
(
nullptr
),
semaphore_
(
nullptr
)
{
Eigen
::
initializeDeviceProp
();
}
~
EigenGpuStreamDevice
()
override
{}
void
Reinitialize
(
gpuStream_t
cuda_stream
,
phi
::
Allocator
*
allocator
,
GPUPlace
place
)
{
stream_
=
cuda_stream
;
allocator_
=
allocator
;
device_prop_
=
&
Eigen
::
m_deviceProperties
[
place
.
device
];
}
const
gpuStream_t
&
stream
()
const
override
{
return
stream_
;
}
const
gpuDeviceProp
&
deviceProperties
()
const
override
{
return
*
device_prop_
;
}
void
*
allocate
(
size_t
num_bytes
)
const
override
{
if
(
UNLIKELY
(
num_bytes
==
0
))
{
return
nullptr
;
}
auto
buf
=
allocator_
->
Allocate
(
num_bytes
);
VLOG
(
4
)
<<
"Eigen allocated at "
<<
buf
->
ptr
()
<<
" requested "
<<
num_bytes
;
void
*
retv
=
buf
->
ptr
();
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mtx_
);
allocations_
.
emplace
(
retv
,
std
::
move
(
buf
));
}
return
retv
;
}
void
deallocate
(
void
*
buffer
)
const
override
{
if
(
LIKELY
(
buffer
))
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mtx_
);
allocations_
.
erase
(
buffer
);
}
}
void
*
scratchpad
()
const
override
{
if
(
scratch_
==
NULL
)
{
scratch_
=
allocate
(
Eigen
::
kGpuScratchSize
+
sizeof
(
unsigned
int
));
}
return
scratch_
;
}
unsigned
int
*
semaphore
()
const
override
{
if
(
semaphore_
==
NULL
)
{
char
*
scratch
=
static_cast
<
char
*>
(
scratchpad
())
+
Eigen
::
kGpuScratchSize
;
semaphore_
=
reinterpret_cast
<
unsigned
int
*>
(
scratch
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipMemsetAsync
(
semaphore_
,
0
,
sizeof
(
unsigned
int
),
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemsetAsync
(
semaphore_
,
0
,
sizeof
(
unsigned
int
),
stream_
));
#endif
}
return
semaphore_
;
}
private:
gpuStream_t
stream_
;
// not owned;
phi
::
Allocator
*
allocator_
;
// not owned;
const
gpuDeviceProp
*
device_prop_
;
// not owned;
mutable
void
*
scratch_
;
mutable
unsigned
int
*
semaphore_
;
mutable
std
::
mutex
mtx_
;
// to protect allocations_
mutable
std
::
unordered_map
<
void
*
,
phi
::
Allocator
::
AllocationPtr
>
allocations_
;
};
#endif
}
// namespace internal
ResourceManager
::
ResourceManager
(
const
phi
::
Place
&
place
,
void
*
stream
)
:
place_
(
place
)
{
InitCPUResource
();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
InitGPUResource
(
stream
);
#endif
}
ResourceManager
::~
ResourceManager
()
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DestroyGPUResource
();
#endif
}
void
ResourceManager
::
InitCPUResource
()
{
cpu_eigen_device_
.
reset
(
new
Eigen
::
DefaultDevice
());
}
Eigen
::
DefaultDevice
*
ResourceManager
::
GetCpuEigenDevice
()
{
return
cpu_eigen_device_
.
get
();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void
ResourceManager
::
InitGPUResource
(
void
*
stream
)
{
if
(
stream
==
nullptr
)
{
owned_stream_
=
true
;
phi
::
InitStream
(
&
stream_
);
}
else
{
owned_stream_
=
false
;
stream_
=
reinterpret_cast
<
gpuStream_t
>
(
stream
);
}
InitGpuProperties
();
InitGpuEigenDevice
();
InitDnnHanlde
();
InitBlasHandle
();
InitBlasLtHandle
();
InitSolverHandle
();
InitSparseHandle
();
}
void
ResourceManager
::
DestroyGPUResource
()
{
if
(
owned_stream_
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamDestroy
(
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamDestroy
(
stream_
));
#endif
stream_
=
nullptr
;
}
DestroyDnnHandle
();
DestroyBlasHandle
();
DestroyBlasLtHandle
();
DestroySolverHandle
();
DestroySparseHandle
();
}
void
ResourceManager
::
InitGpuProperties
()
{
phi
::
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
phi
::
InitGpuProperties
(
place_
,
&
compute_capability_
,
&
runtime_version_
,
&
driver_version_
,
&
multi_process_
,
&
max_threads_per_mp_
,
&
max_threads_per_block_
,
&
max_grid_dim_size_
);
}
void
ResourceManager
::
InitGpuEigenDevice
()
{
auto
*
allocator
=
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
place_
)
.
get
();
eigen_stream_
.
reset
(
new
internal
::
EigenGpuStreamDevice
());
eigen_stream_
->
Reinitialize
(
stream_
,
allocator
,
place_
);
gpu_eigen_device_
.
reset
(
new
Eigen
::
GpuDevice
(
eigen_stream_
.
get
()));
}
void
ResourceManager
::
InitDnnHanlde
()
{
phi
::
InitDnnHandle
(
&
dnn_handle_
,
stream_
,
place_
);
}
void
ResourceManager
::
DestroyDnnHandle
()
{
phi
::
DestroyDnnHandle
(
dnn_handle_
);
}
void
ResourceManager
::
InitBlasHandle
()
{
phi
::
InitBlasHandle
(
&
blas_handle_
,
stream_
);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
phi
::
InitBlasHandle
(
&
blas_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tensor_core_handle_
,
CUBLAS_TENSOR_OP_MATH
));
#endif
#if CUDA_VERSION >= 11000
phi
::
InitBlasHandle
(
&
blas_tf32_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tf32_tensor_core_handle_
,
CUBLAS_TF32_TENSOR_OP_MATH
));
#endif
#endif
}
void
ResourceManager
::
DestroyBlasHandle
()
{
phi
::
DestroyBlasHandle
(
blas_handle_
);
phi
::
DestroyBlasHandle
(
blas_tensor_core_handle_
);
phi
::
DestroyBlasHandle
(
blas_tf32_tensor_core_handle_
);
}
void
ResourceManager
::
InitBlasLtHandle
()
{
phi
::
InitBlasLtHandle
(
&
blaslt_handle_
);
}
void
ResourceManager
::
DestroyBlasLtHandle
()
{
phi
::
DestroyBlasLtHandle
(
blaslt_handle_
);
}
void
ResourceManager
::
InitSolverHandle
()
{
phi
::
InitSolverHandle
(
&
solver_handle_
,
stream_
);
}
void
ResourceManager
::
DestroySolverHandle
()
{
phi
::
DestroySolverHandle
(
solver_handle_
);
}
void
ResourceManager
::
InitSparseHandle
()
{
phi
::
InitSparseHandle
(
&
sparse_handle_
,
stream_
);
}
void
ResourceManager
::
DestroySparseHandle
()
{
phi
::
DestroySparseHandle
(
sparse_handle_
);
}
gpuStream_t
ResourceManager
::
GetStream
()
const
{
return
stream_
;
}
dnnHandle_t
ResourceManager
::
GetDnnHandle
()
const
{
return
dnn_handle_
;
}
blasHandle_t
ResourceManager
::
GetBlasHandle
()
const
{
return
blas_handle_
;
}
blasHandle_t
ResourceManager
::
GetBlasTensorCoreHandle
()
const
{
return
blas_tensor_core_handle_
;
}
blasHandle_t
ResourceManager
::
GetBlasTF32Handle
()
const
{
return
blas_tf32_tensor_core_handle_
;
}
blasLtHandle_t
ResourceManager
::
GetBlasLtHandle
()
const
{
return
blaslt_handle_
;
}
phi
::
solverHandle_t
ResourceManager
::
GetSolverDnHandle
()
const
{
return
solver_handle_
;
}
phi
::
sparseHandle_t
ResourceManager
::
GetSparseHandle
()
const
{
return
sparse_handle_
;
}
Eigen
::
GpuDevice
*
ResourceManager
::
GetGpuEigenDevice
()
const
{
return
gpu_eigen_device_
.
get
();
}
int
ResourceManager
::
GetGpuComputeCapability
()
const
{
return
compute_capability_
;
}
int
ResourceManager
::
GetGpuRuntimeVersion
()
const
{
return
runtime_version_
;
}
int
ResourceManager
::
GetGpuDriverVersion
()
const
{
return
driver_version_
;
}
int
ResourceManager
::
GetGPUMultiProcessors
()
const
{
return
multi_process_
;
}
int
ResourceManager
::
GetGpuMaxThreadsPerMp
()
const
{
return
max_threads_per_mp_
;
}
int
ResourceManager
::
GetGpuMaxThreadsPerBlock
()
const
{
return
max_threads_per_block_
;
}
std
::
array
<
int
,
3
>
ResourceManager
::
GetGpuMaxGridDimSize
()
const
{
return
max_grid_dim_size_
;
}
#endif
}
// namespace paddle
paddle/fluid/inference/api/resource_manager.h
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/cpu/forwards.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_resources.h"
#endif
namespace
paddle
{
namespace
internal
{
class
EigenGpuStreamDevice
;
}
// namespace internal
class
ResourceManager
{
public:
explicit
ResourceManager
(
const
phi
::
Place
&
place
,
void
*
stream
);
~
ResourceManager
();
public:
Eigen
::
DefaultDevice
*
GetCpuEigenDevice
();
private:
void
InitCPUResource
();
private:
phi
::
Place
place_
;
std
::
unique_ptr
<
Eigen
::
DefaultDevice
>
cpu_eigen_device_
;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
public:
gpuStream_t
GetStream
()
const
;
dnnHandle_t
GetDnnHandle
()
const
;
blasHandle_t
GetBlasHandle
()
const
;
blasHandle_t
GetBlasTensorCoreHandle
()
const
;
blasHandle_t
GetBlasTF32Handle
()
const
;
blasLtHandle_t
GetBlasLtHandle
()
const
;
phi
::
solverHandle_t
GetSolverDnHandle
()
const
;
phi
::
sparseHandle_t
GetSparseHandle
()
const
;
Eigen
::
GpuDevice
*
GetGpuEigenDevice
()
const
;
int
GetGpuComputeCapability
()
const
;
int
GetGpuRuntimeVersion
()
const
;
int
GetGpuDriverVersion
()
const
;
int
GetGPUMultiProcessors
()
const
;
int
GetGpuMaxThreadsPerMp
()
const
;
int
GetGpuMaxThreadsPerBlock
()
const
;
std
::
array
<
int
,
3
>
GetGpuMaxGridDimSize
()
const
;
private:
void
InitGPUResource
(
void
*
stream
);
void
DestroyGPUResource
();
void
InitGpuProperties
();
void
InitGpuEigenDevice
();
void
InitDnnHanlde
();
void
DestroyDnnHandle
();
void
InitBlasHandle
();
void
DestroyBlasHandle
();
void
InitBlasLtHandle
();
void
DestroyBlasLtHandle
();
void
InitSolverHandle
();
void
DestroySolverHandle
();
void
InitSparseHandle
();
void
DestroySparseHandle
();
private:
int
compute_capability_
;
int
runtime_version_
;
int
driver_version_
;
int
multi_process_
;
int
max_threads_per_mp_
;
int
max_threads_per_block_
;
std
::
array
<
int
,
3
>
max_grid_dim_size_
;
bool
owned_stream_
{
true
};
gpuStream_t
stream_
;
std
::
unique_ptr
<
Eigen
::
GpuDevice
>
gpu_eigen_device_
;
std
::
unique_ptr
<
internal
::
EigenGpuStreamDevice
>
eigen_stream_
;
blasHandle_t
blas_handle_
{
nullptr
};
blasHandle_t
blas_tensor_core_handle_
{
nullptr
};
blasHandle_t
blas_tf32_tensor_core_handle_
{
nullptr
};
blasLtHandle_t
blaslt_handle_
{
nullptr
};
dnnHandle_t
dnn_handle_
{
nullptr
};
phi
::
solverHandle_t
solver_handle_
{
nullptr
};
phi
::
sparseHandle_t
sparse_handle_
{
nullptr
};
// DnnWorkspaceHandle
#endif
};
}
// namespace paddle
paddle/phi/backends/gpu/CMakeLists.txt
浏览文件 @
1280f294
...
...
@@ -6,4 +6,5 @@ elseif(WITH_ROCM)
hip_library
(
phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda
)
endif
()
cc_library
(
gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3
)
cc_library
(
gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info
)
cc_library
(
gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources
)
paddle/phi/backends/gpu/gpu_context.cc
浏览文件 @
1280f294
...
...
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_resources.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
...
...
@@ -202,27 +203,65 @@ struct GPUContext::Impl {
void
Init
()
{
owned_
=
true
;
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
InitGpuProperties
();
InitStream
();
phi
::
InitGpuProperties
(
place_
,
&
compute_capability_
,
&
runtime_version_
,
&
driver_version_
,
&
multi_process_
,
&
max_threads_per_mp_
,
&
max_threads_per_block_
,
&
max_grid_dim_size_
);
phi
::
InitStream
(
&
stream_
);
InitEigenDevice
();
InitBlasHandle
();
InitBlasLtHandle
();
InitDNNHandle
();
InitSolverHandle
();
InitSparseHandle
();
phi
::
InitBlasHandle
(
&
blas_handle_
,
stream_
);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
phi
::
InitBlasHandle
(
&
blas_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tensor_core_handle_
,
CUBLAS_TENSOR_OP_MATH
));
#endif
#if CUDA_VERSION >= 11000
phi
::
InitBlasHandle
(
&
blas_tf32_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tf32_tensor_core_handle_
,
CUBLAS_TF32_TENSOR_OP_MATH
));
#endif
#endif
phi
::
InitBlasLtHandle
(
&
blaslt_handle_
);
phi
::
InitDnnHandle
(
&
dnn_handle_
,
stream_
,
place_
);
phi
::
InitSolverHandle
(
&
solver_handle_
,
stream_
);
phi
::
InitSparseHandle
(
&
sparse_handle_
,
stream_
);
InitDnnWorkspace
();
}
void
PartialInitWithoutAllocator
()
{
owned_
=
true
;
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
InitGpuProperties
();
InitStream
();
InitBlasHandle
();
InitBlasLtHandle
();
InitDNNHandle
();
InitSolverHandle
();
InitSparseHandle
();
phi
::
InitGpuProperties
(
place_
,
&
compute_capability_
,
&
runtime_version_
,
&
driver_version_
,
&
multi_process_
,
&
max_threads_per_mp_
,
&
max_threads_per_block_
,
&
max_grid_dim_size_
);
phi
::
InitStream
(
&
stream_
);
phi
::
InitBlasHandle
(
&
blas_handle_
,
stream_
);
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 9000
phi
::
InitBlasHandle
(
&
blas_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tensor_core_handle_
,
CUBLAS_TENSOR_OP_MATH
));
#endif
#if CUDA_VERSION >= 11000
phi
::
InitBlasHandle
(
&
blas_tf32_tensor_core_handle_
,
stream_
);
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tf32_tensor_core_handle_
,
CUBLAS_TF32_TENSOR_OP_MATH
));
#endif
#endif
phi
::
InitBlasLtHandle
(
&
blaslt_handle_
);
phi
::
InitDnnHandle
(
&
dnn_handle_
,
stream_
,
place_
);
phi
::
InitSolverHandle
(
&
solver_handle_
,
stream_
);
phi
::
InitSparseHandle
(
&
sparse_handle_
,
stream_
);
}
void
PartialInitWithAllocator
()
{
...
...
@@ -238,19 +277,23 @@ struct GPUContext::Impl {
~
Impl
()
{
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
DestoryInternalWorkspace
();
DestoryInternalEigenDevice
();
DestroyInternalSparseHandle
();
DestroyInternalSolverHandle
();
DestroyInternalDnnHandle
();
if
(
owned_
)
{
DestoryInternalWorkspace
();
DestoryInternalEigenDevice
();
phi
::
DestroySparseHandle
(
sparse_handle_
);
phi
::
DestroySolverHandle
(
solver_handle_
);
phi
::
DestroyDnnHandle
(
dnn_handle_
);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
nccl_comm_
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
ncclCommDestroy
(
nccl_comm_
));
}
if
(
nccl_comm_
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
ncclCommDestroy
(
nccl_comm_
));
}
#endif
DestroyInternalBlasHandle
();
DestroyInternalBlasLtHandle
();
DestoryInternalStream
();
phi
::
DestroyBlasHandle
(
blas_handle_
);
phi
::
DestroyBlasHandle
(
blas_tensor_core_handle_
);
phi
::
DestroyBlasHandle
(
blas_tf32_tensor_core_handle_
);
phi
::
DestroyBlasLtHandle
(
blaslt_handle_
);
phi
::
DestoryStream
(
stream_
);
}
}
const
Place
&
GetPlace
()
const
{
return
place_
;
}
...
...
@@ -259,73 +302,6 @@ struct GPUContext::Impl {
return
blas_tensor_core_handle_
!=
nullptr
;
}
void
InitGpuProperties
()
{
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
GetDeviceId
());
compute_capability_
=
backends
::
gpu
::
GetGPUComputeCapability
(
place_
.
GetDeviceId
());
multi_process_
=
backends
::
gpu
::
GetGPUMultiProcessors
(
place_
.
GetDeviceId
());
max_threads_per_mp_
=
backends
::
gpu
::
GetGPUMaxThreadsPerMultiProcessor
(
place_
.
GetDeviceId
());
max_grid_dim_size_
=
backends
::
gpu
::
GetGpuMaxGridDimSize
(
place_
.
GetDeviceId
());
max_threads_per_block_
=
backends
::
gpu
::
GetGPUMaxThreadsPerBlock
(
place_
.
GetDeviceId
());
driver_version_
=
backends
::
gpu
::
GetGPUDriverVersion
(
place_
.
GetDeviceId
());
runtime_version_
=
backends
::
gpu
::
GetGPURuntimeVersion
(
place_
.
GetDeviceId
());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N
(
WARNING
,
1
)
<<
"Please NOTE: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
", GPU Compute Capability: "
<<
compute_capability_
/
10
<<
"."
<<
compute_capability_
%
10
<<
", Driver API Version: "
<<
driver_version_
/
1000
<<
"."
<<
(
driver_version_
%
100
)
/
10
<<
", Runtime API Version: "
<<
runtime_version_
/
1000
<<
"."
<<
(
runtime_version_
%
100
)
/
10
;
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
cudnn_dso_ver
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
cudnn_dso_ver
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
cudnn_dso_ver
/
100
<<
"."
<<
cudnn_dso_ver
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
#else
size_t
cudnn_dso_ver
=
dynload
::
cudnnGetVersion
();
LOG_FIRST_N
(
WARNING
,
1
)
<<
"device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
", cuDNN Version: "
<<
cudnn_dso_ver
/
1000
<<
"."
<<
(
cudnn_dso_ver
%
1000
)
/
100
<<
"."
;
// Check CUDA/CUDNN version compatiblity
auto
local_cuda_version
=
(
driver_version_
/
1000
)
*
10
+
(
driver_version_
%
100
)
/
10
;
auto
compile_cuda_version
=
(
CUDA_VERSION
/
1000
)
*
10
+
(
CUDA_VERSION
%
100
)
/
10
;
if
(
local_cuda_version
<
compile_cuda_version
)
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
". The installed Paddle is compiled with CUDA "
<<
compile_cuda_version
/
10
<<
"."
<<
compile_cuda_version
%
10
<<
", but CUDA runtime version in your machine is "
<<
local_cuda_version
/
10
<<
"."
<<
local_cuda_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDA "
"version."
;
}
#endif
}
void
InitDnnWorkspace
()
{
PD_CHECK
(
allocator_
!=
nullptr
,
"the device allocator for gpu context is nullptr."
);
...
...
@@ -350,27 +326,6 @@ struct GPUContext::Impl {
return
DnnWorkspaceHandle
(
allocator_
,
stream_
);
}
void
InitStream
()
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamCreateWithPriority
(
&
stream_
,
hipStreamDefault
,
0
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamCreateWithPriority
(
&
stream_
,
cudaStreamDefault
,
0
));
#endif
}
void
DestoryInternalStream
()
{
if
(
owned_
&&
stream_
!=
nullptr
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamDestroy
(
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamDestroy
(
stream_
));
#endif
}
stream_
=
nullptr
;
}
void
SetStream
(
gpuStream_t
stream
)
{
stream_
=
stream
;
}
gpuStream_t
GetStream
()
const
{
...
...
@@ -400,55 +355,6 @@ struct GPUContext::Impl {
return
eigen_device_
;
}
void
InitBlasHandle
()
{
#ifdef PADDLE_WITH_HIP
phi
::
dynload
::
rocblas_create_handle
(
&
blas_handle_
);
phi
::
dynload
::
rocblas_set_stream
(
blas_handle_
,
stream_
);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasCreate
(
&
blas_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetStream
(
blas_handle_
,
stream_
));
#if CUDA_VERSION >= 9000
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasCreate
(
&
blas_tensor_core_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetStream
(
blas_tensor_core_handle_
,
stream_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tensor_core_handle_
,
CUBLAS_TENSOR_OP_MATH
));
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasCreate
(
&
blas_tf32_tensor_core_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetStream
(
blas_tf32_tensor_core_handle_
,
stream_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetMathMode
(
blas_tf32_tensor_core_handle_
,
CUBLAS_TF32_TENSOR_OP_MATH
));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
#endif // PADDLE_WITH_HIP
}
void
DestroyInternalBlasHandle
()
{
#ifdef PADDLE_WITH_HIP
if
(
owned_
&&
blas_handle_
!=
nullptr
)
{
phi
::
dynload
::
rocblas_destroy_handle
(
blas_handle_
);
blas_handle_
=
nullptr
;
}
#else
if
(
owned_
&&
blas_handle_
!=
nullptr
)
{
phi
::
dynload
::
cublasDestroy
(
blas_handle_
);
blas_handle_
=
nullptr
;
}
if
(
owned_
&&
blas_tensor_core_handle_
!=
nullptr
)
{
phi
::
dynload
::
cublasDestroy
(
blas_tensor_core_handle_
);
blas_tensor_core_handle_
=
nullptr
;
}
if
(
owned_
&&
blas_tf32_tensor_core_handle_
!=
nullptr
)
{
phi
::
dynload
::
cublasDestroy
(
blas_tf32_tensor_core_handle_
);
blas_tf32_tensor_core_handle_
=
nullptr
;
}
#endif // PADDLE_WITH_HIP
}
blasHandle_t
GetBlasHandle
()
const
{
PD_CHECK
(
blas_handle_
!=
nullptr
,
"the gpu blas handle is nullptr."
);
return
blas_handle_
;
...
...
@@ -456,16 +362,12 @@ struct GPUContext::Impl {
void
SetBlasHandle
(
blasHandle_t
blas
)
{
blas_handle_
=
blas
;
}
void
InitBlasLtHandle
()
{
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
phi
::
dynload
::
cublasLtCreate
(
&
blaslt_handle_
);
#endif
void
SetBlasTensorCoreHandle
(
blasHandle_t
handle
)
{
blas_tensor_core_handle_
=
handle
;
}
void
DestroyInternalBlasLtHandle
()
{
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
phi
::
dynload
::
cublasLtDestroy
(
blaslt_handle_
);
#endif
void
SetBlasTF32Handle
(
blasHandle_t
handle
)
{
blas_tf32_tensor_core_handle_
=
handle
;
}
void
SetBlasLtHandle
(
blasLtHandle_t
blaslt
)
{
blaslt_handle_
=
blaslt
;
}
...
...
@@ -475,53 +377,6 @@ struct GPUContext::Impl {
return
blaslt_handle_
;
}
void
InitDNNHandle
()
{
if
(
phi
::
dynload
::
HasCUDNN
())
{
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
local_miopen_version
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
local_miopen_version
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place_
.
device
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
local_miopen_version
/
100
<<
"."
<<
local_miopen_version
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenCreate
(
&
dnn_handle_
));
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenSetStream
(
dnn_handle_
,
stream_
));
#else
auto
local_cudnn_version
=
phi
::
dynload
::
cudnnGetVersion
()
/
100
;
auto
compile_cudnn_version
=
CUDNN_VERSION
/
100
;
if
(
local_cudnn_version
<
static_cast
<
size_t
>
(
compile_cudnn_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place_
.
device
<<
". The installed Paddle is compiled with CUDNN "
<<
compile_cudnn_version
/
10
<<
"."
<<
compile_cudnn_version
%
10
<<
", but CUDNN version in your machine is "
<<
local_cudnn_version
/
10
<<
"."
<<
local_cudnn_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDNN "
"version."
;
}
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cudnnCreate
(
&
dnn_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cudnnSetStream
(
dnn_handle_
,
stream_
));
#endif
}
else
{
dnn_handle_
=
nullptr
;
}
}
dnnHandle_t
GetDnnHandle
()
{
PD_CHECK
(
dnn_handle_
!=
nullptr
,
"the gpu dnn handle is nullptr."
);
return
dnn_handle_
;
...
...
@@ -543,24 +398,6 @@ struct GPUContext::Impl {
void
SetDnnHandle
(
dnnHandle_t
handle
)
{
dnn_handle_
=
handle
;
}
void
InitSolverHandle
()
{
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cusolverDnCreate
(
&
solver_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cusolverDnSetStream
(
solver_handle_
,
stream_
));
#endif
}
void
DestroyInternalSolverHandle
()
{
#ifndef PADDLE_WITH_HIP
if
(
owned_
&&
solver_handle_
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
cusolverDnDestroy
(
solver_handle_
));
solver_handle_
=
nullptr
;
}
#endif
}
solverHandle_t
GetSolverHandle
()
const
{
PD_CHECK
(
solver_handle_
!=
nullptr
,
"the gpu solver handle is nullptr."
);
return
solver_handle_
;
...
...
@@ -568,29 +405,6 @@ struct GPUContext::Impl {
void
SetSolverHandle
(
solverHandle_t
handle
)
{
solver_handle_
=
handle
;
}
void
InitSparseHandle
()
{
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseCreate
(
&
sparse_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseSetStream
(
sparse_handle_
,
stream_
));
#endif
#endif
}
void
DestroyInternalSparseHandle
()
{
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if
(
owned_
&&
sparse_handle_
!=
nullptr
)
{
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseDestroy
(
sparse_handle_
));
sparse_handle_
=
nullptr
;
}
#endif
#endif
}
sparseHandle_t
GetSparseHandle
()
const
{
PD_CHECK
(
sparse_handle_
!=
nullptr
,
"the gpu sparse handle is nullptr."
);
return
sparse_handle_
;
...
...
@@ -878,7 +692,10 @@ void GPUContext::Init() {
impl_
->
Init
();
}
void
GPUContext
::
SetStream
(
gpuStream_t
stream
)
{
impl_
->
SetStream
(
stream
);
}
void
GPUContext
::
SetStream
(
gpuStream_t
stream
)
{
impl_
->
allocator_
=
const_cast
<
Allocator
*>
(
&
this
->
GetAllocator
());
impl_
->
SetStream
(
stream
);
}
void
GPUContext
::
SetEigenDevice
(
Eigen
::
GpuDevice
*
device
)
{
impl_
->
SetEigenDevice
(
device
);
...
...
@@ -888,6 +705,14 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) {
impl_
->
SetBlasHandle
(
blas
);
}
void
GPUContext
::
SetBlasTensorCoreHandle
(
blasHandle_t
handle
)
{
impl_
->
SetBlasTensorCoreHandle
(
handle
);
}
void
GPUContext
::
SetBlasTF32Handle
(
blasHandle_t
handle
)
{
impl_
->
SetBlasTF32Handle
(
handle
);
}
void
GPUContext
::
SetBlasLtHandle
(
blasLtHandle_t
blaslt
)
{
impl_
->
SetBlasLtHandle
(
blaslt
);
}
...
...
paddle/phi/backends/gpu/gpu_context.h
浏览文件 @
1280f294
...
...
@@ -199,6 +199,10 @@ class PADDLE_API GPUContext : public DeviceContext {
void
SetBlasHandle
(
blasHandle_t
);
void
SetBlasTensorCoreHandle
(
blasHandle_t
);
void
SetBlasTF32Handle
(
blasHandle_t
);
void
SetBlasLtHandle
(
blasLtHandle_t
);
void
SetDnnHandle
(
dnnHandle_t
);
...
...
paddle/phi/backends/gpu/gpu_resources.cc
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_resources.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cublas.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/phi/backends/dynload/nccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#include "unsupported/Eigen/CXX11/Tensor"
// TODO(phi): remove fluid header.
#include "paddle/fluid/platform/enforce.h"
namespace
phi
{
void
InitGpuProperties
(
Place
place
,
int
*
compute_capability
,
int
*
runtime_version
,
int
*
driver_version
,
int
*
multi_process
,
int
*
max_threads_per_mp
,
int
*
max_threads_per_block
,
std
::
array
<
int
,
3
>*
max_grid_dim_size
)
{
backends
::
gpu
::
GPUDeviceGuard
guard
(
place
.
GetDeviceId
());
*
compute_capability
=
backends
::
gpu
::
GetGPUComputeCapability
(
place
.
GetDeviceId
());
*
multi_process
=
backends
::
gpu
::
GetGPUMultiProcessors
(
place
.
GetDeviceId
());
*
max_threads_per_mp
=
backends
::
gpu
::
GetGPUMaxThreadsPerMultiProcessor
(
place
.
GetDeviceId
());
*
max_grid_dim_size
=
backends
::
gpu
::
GetGpuMaxGridDimSize
(
place
.
GetDeviceId
());
*
max_threads_per_block
=
backends
::
gpu
::
GetGPUMaxThreadsPerBlock
(
place
.
GetDeviceId
());
*
driver_version
=
backends
::
gpu
::
GetGPUDriverVersion
(
place
.
GetDeviceId
());
*
runtime_version
=
backends
::
gpu
::
GetGPURuntimeVersion
(
place
.
GetDeviceId
());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N
(
WARNING
,
1
)
<<
"Please NOTE: device: "
<<
static_cast
<
int
>
(
place
.
device
)
<<
", GPU Compute Capability: "
<<
*
compute_capability
/
10
<<
"."
<<
*
compute_capability
%
10
<<
", Driver API Version: "
<<
*
driver_version
/
1000
<<
"."
<<
(
*
driver_version
%
100
)
/
10
<<
", Runtime API Version: "
<<
*
runtime_version
/
1000
<<
"."
<<
(
*
runtime_version
%
100
)
/
10
;
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
cudnn_dso_ver
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
cudnn_dso_ver
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place
.
device
)
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
cudnn_dso_ver
/
100
<<
"."
<<
cudnn_dso_ver
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
#else
size_t
cudnn_dso_ver
=
dynload
::
cudnnGetVersion
();
LOG_FIRST_N
(
WARNING
,
1
)
<<
"device: "
<<
static_cast
<
int
>
(
place
.
device
)
<<
", cuDNN Version: "
<<
cudnn_dso_ver
/
1000
<<
"."
<<
(
cudnn_dso_ver
%
1000
)
/
100
<<
"."
;
// Check CUDA/CUDNN version compatiblity
auto
local_cuda_version
=
(
*
driver_version
/
1000
)
*
10
+
(
*
driver_version
%
100
)
/
10
;
auto
compile_cuda_version
=
(
CUDA_VERSION
/
1000
)
*
10
+
(
CUDA_VERSION
%
100
)
/
10
;
if
(
local_cuda_version
<
compile_cuda_version
)
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place
.
device
)
<<
". The installed Paddle is compiled with CUDA "
<<
compile_cuda_version
/
10
<<
"."
<<
compile_cuda_version
%
10
<<
", but CUDA runtime version in your machine is "
<<
local_cuda_version
/
10
<<
"."
<<
local_cuda_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDA "
"version."
;
}
#endif
}
void
InitStream
(
gpuStream_t
*
stream
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamCreateWithPriority
(
stream
,
hipStreamDefault
,
0
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamCreateWithPriority
(
stream
,
cudaStreamDefault
,
0
));
#endif
}
void
DestoryStream
(
gpuStream_t
stream
)
{
if
(
stream
!=
nullptr
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamDestroy
(
stream
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamDestroy
(
stream
));
#endif
}
stream
=
nullptr
;
}
void
InitBlasHandle
(
blasHandle_t
*
blas_handle
,
gpuStream_t
stream
)
{
#ifdef PADDLE_WITH_HIP
phi
::
dynload
::
rocblas_create_handle
(
blas_handle
);
phi
::
dynload
::
rocblas_set_stream
(
*
blas_handle
,
stream
);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasCreate
(
blas_handle
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cublasSetStream
(
*
blas_handle
,
stream
));
#endif // PADDLE_WITH_HIP
}
void
DestroyBlasHandle
(
blasHandle_t
handle
)
{
#ifdef PADDLE_WITH_HIP
if
(
handle
!=
nullptr
)
{
phi
::
dynload
::
rocblas_destroy_handle
(
handle
);
handle
=
nullptr
;
}
#else
if
(
handle
!=
nullptr
)
{
phi
::
dynload
::
cublasDestroy
(
handle
);
handle
=
nullptr
;
}
#endif // PADDLE_WITH_HIP
}
void
InitBlasLtHandle
(
blasLtHandle_t
*
blaslt_handle
)
{
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
phi
::
dynload
::
cublasLtCreate
(
blaslt_handle
);
#endif
}
void
DestroyBlasLtHandle
(
blasLtHandle_t
handle
)
{
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
if
(
handle
!=
nullptr
)
{
phi
::
dynload
::
cublasLtDestroy
(
handle
);
handle
=
nullptr
;
}
#endif
}
void
InitDnnHandle
(
dnnHandle_t
*
handle
,
gpuStream_t
stream
,
Place
place
)
{
if
(
phi
::
dynload
::
HasCUDNN
())
{
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
local_miopen_version
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
local_miopen_version
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place
.
device
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
local_miopen_version
/
100
<<
"."
<<
local_miopen_version
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenCreate
(
handle
));
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenSetStream
(
*
handle
,
stream
));
#else
auto
local_cudnn_version
=
phi
::
dynload
::
cudnnGetVersion
()
/
100
;
auto
compile_cudnn_version
=
CUDNN_VERSION
/
100
;
if
(
local_cudnn_version
<
static_cast
<
size_t
>
(
compile_cudnn_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place
.
device
<<
". The installed Paddle is compiled with CUDNN "
<<
compile_cudnn_version
/
10
<<
"."
<<
compile_cudnn_version
%
10
<<
", but CUDNN version in your machine is "
<<
local_cudnn_version
/
10
<<
"."
<<
local_cudnn_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDNN "
"version."
;
}
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cudnnCreate
(
handle
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cudnnSetStream
(
*
handle
,
stream
));
#endif
}
else
{
*
handle
=
nullptr
;
}
}
void
DestroyDnnHandle
(
dnnHandle_t
handle
)
{
#ifdef PADDLE_WITH_HIP
if
(
handle
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
miopenDestroy
(
handle
));
handle
=
nullptr
;
}
#else
if
(
handle
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
cudnnDestroy
(
handle
));
handle
=
nullptr
;
}
#endif // PADDLE_WITH_HIP
}
void
InitSolverHandle
(
solverHandle_t
*
handle
,
gpuStream_t
stream
)
{
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cusolverDnCreate
(
handle
));
PADDLE_RETRY_CUDA_SUCCESS
(
phi
::
dynload
::
cusolverDnSetStream
(
*
handle
,
stream
));
#endif
}
void
DestroySolverHandle
(
solverHandle_t
solver_handle
)
{
#ifndef PADDLE_WITH_HIP
if
(
solver_handle
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
cusolverDnDestroy
(
solver_handle
));
solver_handle
=
nullptr
;
}
#endif
}
void
InitSparseHandle
(
sparseHandle_t
*
handle
,
gpuStream_t
stream
)
{
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseCreate
(
handle
));
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseSetStream
(
*
handle
,
stream
));
#endif
#endif
}
void
DestroySparseHandle
(
sparseHandle_t
handle
)
{
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if
(
handle
!=
nullptr
)
{
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseDestroy
(
handle
));
handle
=
nullptr
;
}
#endif
#endif
}
}
// namespace phi
paddle/phi/backends/gpu/gpu_resources.h
0 → 100644
浏览文件 @
1280f294
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/common/place.h"
namespace
phi
{
void
InitGpuProperties
(
Place
place
,
int
*
compute_capability
,
int
*
runtime_version
,
int
*
driver_version
,
int
*
multi_process
,
int
*
max_threads_per_mp
,
int
*
max_threads_per_block
,
std
::
array
<
int
,
3
>*
max_grid_dim_size
);
void
InitStream
(
gpuStream_t
*
stream
);
void
DestoryStream
(
gpuStream_t
stream
);
void
InitBlasHandle
(
blasHandle_t
*
blas_handle
,
gpuStream_t
stream
);
void
DestroyBlasHandle
(
blasHandle_t
handle
);
void
InitBlasLtHandle
(
blasLtHandle_t
*
blaslt_handle
);
void
DestroyBlasLtHandle
(
blasLtHandle_t
handle
);
void
InitDnnHandle
(
dnnHandle_t
*
handle
,
gpuStream_t
stream
,
Place
place
);
void
DestroyDnnHandle
(
dnnHandle_t
handle
);
void
InitSolverHandle
(
solverHandle_t
*
handle
,
gpuStream_t
stream
);
void
DestroySolverHandle
(
solverHandle_t
solver_handle
);
void
InitSparseHandle
(
sparseHandle_t
*
handle
,
gpuStream_t
stream
);
void
DestroySparseHandle
(
sparseHandle_t
handle
);
// void InitDnnWorkspace();
}
// namespace phi
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录