Repository: Crayon鑫 / Paddle (fork of PaddlePaddle / Paddle)

Commit a821c4a9 (unverified)

[PTEN] Add Gpu context (#39305)

Author: Wilber
Committed by: GitHub on Feb 06, 2022
Parent: dcff7fa8

Showing 135 changed files with 4,438 additions and 770 deletions (+4438, -770).
paddle/fluid/distributed/common/utils.h  +1 -1
paddle/fluid/distributed/ps/service/communicator/communicator.cc  +4 -4
paddle/fluid/distributed/ps/service/communicator/communicator.h  +2 -2
paddle/fluid/framework/data_type_transform_test.cu  +4 -1
paddle/fluid/framework/parallel_executor.cc  +1 -1
paddle/fluid/framework/pten_utils.h  +7 -0
paddle/fluid/framework/tensor_util.cc  +3 -3
paddle/fluid/framework/tensor_util_test.cc  +20 -0
paddle/fluid/imperative/gloo_context.cc  +11 -0
paddle/fluid/inference/lite/test_engine_lite.cc  +4 -0
paddle/fluid/inference/tensorrt/test_engine.cc  +12 -0
paddle/fluid/memory/allocation/best_fit_allocator_test.cu  +5 -0
paddle/fluid/memory/malloc_test.cu  +31 -4
paddle/fluid/operators/arg_min_max_op_base.cu.h  +1 -1
paddle/fluid/operators/argsort_op.cu  +2 -2
paddle/fluid/operators/broadcast_tensors_op.cu  +2 -2
paddle/fluid/operators/cholesky_solve_op.cu  +2 -1
paddle/fluid/operators/clip_by_norm_op.cu  +1 -1
paddle/fluid/operators/controlflow/compare_all_op.cu  +2 -1
paddle/fluid/operators/copy_cross_scope_test.cc  +8 -0
paddle/fluid/operators/elementwise/elementwise_mul_op.cu  +3 -2
paddle/fluid/operators/elementwise/elementwise_op_function.h  +2 -1
paddle/fluid/operators/feed_forward_test.cu  +12 -0
paddle/fluid/operators/fused/attn_gemm.h  +2 -1
paddle/fluid/operators/gelu_op.cu  +2 -2
paddle/fluid/operators/graph_khop_sampler_op.cu  +2 -2
paddle/fluid/operators/graph_send_recv_op.cu  +2 -2
paddle/fluid/operators/gumbel_softmax_op.cu  +1 -1
paddle/fluid/operators/index_sample_op.cu  +3 -3
paddle/fluid/operators/interpolate_v2_op.cu  +4 -4
paddle/fluid/operators/kron_op.h  +2 -2
paddle/fluid/operators/kthvalue_op.cu  +1 -1
paddle/fluid/operators/lite/lite_engine_op_test.cc  +4 -0
paddle/fluid/operators/margin_cross_entropy_op.cu  +2 -2
paddle/fluid/operators/math/beam_search_test.cc  +79 -0
paddle/fluid/operators/math/blas_impl.cu.h  +577 -0
paddle/fluid/operators/math/blas_impl.hip.h  +406 -0
paddle/fluid/operators/math/concat_test.cc  +27 -0
paddle/fluid/operators/math/cusparse_conversion_api_test.cc  +10 -0
paddle/fluid/operators/math/im2col_test.cc  +162 -0
paddle/fluid/operators/math/inclusive_scan.h  +1 -1
paddle/fluid/operators/math/math_function.cu  +13 -0
paddle/fluid/operators/math/math_function_test.cc  +1 -0
paddle/fluid/operators/math/math_function_test.cu  +36 -0
paddle/fluid/operators/math/vol2col_test.cc  +120 -1
paddle/fluid/operators/mean_op.cu  +2 -1
paddle/fluid/operators/nccl/nccl_op_test.cu.cc  +6 -1
paddle/fluid/operators/p_norm_op.cu  +8 -4
paddle/fluid/operators/pool_op.h  +2 -2
paddle/fluid/operators/prelu_op.cu  +2 -1
paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc  +4 -0
paddle/fluid/operators/reduce_ops/reduce_op.cu.h  +4 -2
paddle/fluid/operators/renorm_op.cu  +6 -4
paddle/fluid/operators/reshape_op.cc  +6 -3
paddle/fluid/operators/scatter.cu.h  +1 -2
paddle/fluid/operators/solve_op.h  +2 -1
paddle/fluid/operators/strided_memcpy_test.cc  +9 -0
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc  +12 -0
paddle/fluid/operators/top_k_function_cuda.h  +1 -1
paddle/fluid/operators/trace_op.cu  +2 -1
paddle/fluid/operators/triangular_solve_op.cu  +2 -1
paddle/fluid/operators/viterbi_decode_op.cu  +1 -1
paddle/fluid/platform/CMakeLists.txt  +1 -0
paddle/fluid/platform/bfloat16_test.cu  +4 -0
paddle/fluid/platform/collective_helper.cc  +14 -1
paddle/fluid/platform/device/gpu/CMakeLists.txt  +2 -2
paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt  +0 -1
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h  +3 -2
paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h  +4 -2
paddle/fluid/platform/device/gpu/gpu_info.cc  +89 -18
paddle/fluid/platform/device/gpu/gpu_info.h  +3 -2
paddle/fluid/platform/device/gpu/nccl_helper.h  +15 -2
paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt  +0 -2
paddle/fluid/platform/device_context.cc  +115 -114
paddle/fluid/platform/device_context.h  +58 -98
paddle/fluid/platform/device_context_test.cu  +28 -0
paddle/fluid/platform/device_event_gpu.cc  +2 -3
paddle/fluid/platform/device_event_test.cc  +4 -1
paddle/fluid/platform/enforce.h  +2 -2
paddle/fluid/platform/event.h  +3 -3
paddle/fluid/platform/float16_test.cu  +4 -0
paddle/fluid/platform/for_range.h  +36 -0
paddle/fluid/platform/stream/cuda_stream.cc  +7 -9
paddle/fluid/platform/stream/cuda_stream.h  +11 -5
paddle/fluid/platform/transform_test.cu  +9 -0
paddle/fluid/pybind/cuda_streams_py.cc  +4 -4
paddle/fluid/pybind/pybind.cc  +43 -3
paddle/pten/backends/CMakeLists.txt  +8 -0
paddle/pten/backends/cpu/cpu_context.cc  +26 -42
paddle/pten/backends/cpu/cpu_context.h  +13 -21
paddle/pten/backends/gpu/CMakeLists.txt  +9 -0
paddle/pten/backends/gpu/cuda/CMakeLists.txt  +1 -0
paddle/pten/backends/gpu/cuda/cuda_helper.h  +72 -0
paddle/pten/backends/gpu/cuda/cuda_info.cc  +79 -52
paddle/pten/backends/gpu/forwards.h  +111 -0
paddle/pten/backends/gpu/gpu_context.cc  +899 -0
paddle/pten/backends/gpu/gpu_context.h  +157 -8
paddle/pten/backends/gpu/gpu_decls.h  +70 -0
paddle/pten/backends/gpu/gpu_helper.h  +26 -0
paddle/pten/backends/gpu/gpu_info.cc  +60 -0
paddle/pten/backends/gpu/gpu_info.h  +132 -0
paddle/pten/backends/gpu/gpu_launch_config.h  +181 -0
paddle/pten/backends/gpu/gpu_types.h  +73 -0
paddle/pten/backends/gpu/rocm/CMakeLists.txt  +1 -0
paddle/pten/backends/gpu/rocm/rocm_helper.h  +72 -0
paddle/pten/backends/gpu/rocm/rocm_info.cc  +78 -50
paddle/pten/backends/xpu/xpu_context.cc  +38 -64
paddle/pten/backends/xpu/xpu_context.h  +14 -18
paddle/pten/core/device_context.cc  +7 -7
paddle/pten/core/device_context.h  +3 -3
paddle/pten/kernels/funcs/cuda_kernel_config.h  +2 -1
paddle/pten/kernels/funcs/elementwise_base.h  +3 -2
paddle/pten/kernels/funcs/transpose.cu  +1 -0
paddle/pten/kernels/gpu/concat_and_split.h  +5 -6
paddle/pten/kernels/gpu/copy_kernel.cu  +5 -15
paddle/pten/kernels/gpu/elementwise.h  +4 -4
paddle/pten/kernels/gpu/reduce.h  +15 -6
paddle/pten/kernels/impl/matmul_grad_kernel_impl.h  +5 -3
paddle/pten/tests/api/test_matmul_api.cc  +2 -1
paddle/pten/tests/api/test_sparse_utils_api.cc  +1 -0
paddle/pten/tests/core/test_device_context.cc  +9 -23
paddle/pten/tests/kernels/test_cast_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_concat_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_conj_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_copy_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_creation_dev_api.cc  +16 -16
paddle/pten/tests/kernels/test_dot_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_elementwise_dev_api.cc  +16 -16
paddle/pten/tests/kernels/test_flatten_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_matmul_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_mean_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_reshape_dev_api.cc  +4 -4
paddle/pten/tests/kernels/test_scale_dev_api.cc  +10 -8
paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc  +45 -16
paddle/pten/tests/kernels/test_sum_dev_api.cc  +5 -4
paddle/fluid/distributed/common/utils.h
@@ -33,7 +33,7 @@ namespace distributed {
 template <typename T>
 inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
 GetBlas() {
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
   return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
                                           T>(cpu_ctx);
 }
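The change above, repeated in the communicator files below, swaps copy-initialization from a temporary for plain default construction of the CPU device context. A minimal, self-contained sketch of the two forms, using a stub type rather than the real paddle::platform::CPUDeviceContext:

// Stub standing in for paddle::platform::CPUDeviceContext; illustrative only.
struct CPUDeviceContext {
  int GetPlace() const { return 0; }
};

int main() {
  // Old form removed by the diff: copy-initialization from a temporary,
  // which relies on the context being copyable/movable.
  // auto cpu_ctx = CPUDeviceContext();

  // New form added by the diff: default-construct the context in place.
  CPUDeviceContext cpu_ctx;
  return cpu_ctx.GetPlace();
}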
paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -1155,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
   auto &t_latest = var_latest->Get<framework::LoDTensor>();
   auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
   auto *var_delta = delta_scope_->Var(varname);
   auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
   t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
@@ -1185,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
   RpcRecvDense(varnames, table_id, pserver_scope_.get());
   // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
   for (auto &varname : varnames) {
     auto *var_latest = recv_scope_->FindVar(varname);
     auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
@@ -1292,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
   auto *t_old = var_old->GetMutable<framework::LoDTensor>();
   auto dims1 = t_latest.dims()[1];
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
   auto *var_delta = delta_scope_->Var(varname);
   auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
@@ -1370,7 +1370,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
   std::vector<float> v_delta;
   v_delta.resize(numel);
-  auto cpu_ctx = paddle::platform::CPUDeviceContext();
+  paddle::platform::CPUDeviceContext cpu_ctx;
   auto blas =
       paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
          cpu_ctx);
paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -179,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
     }
     // set output tensor to 0.
-    auto cpu_ctx = paddle::platform::CPUDeviceContext();
+    paddle::platform::CPUDeviceContext cpu_ctx;
     paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
         constant_functor;
     constant_functor(cpu_ctx, out_t, static_cast<T>(0));
@@ -204,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
     for (auto &var : vars) {
       inputs.push_back(&var->Get<pten::SelectedRows>());
     }
-    auto dev_ctx = paddle::platform::CPUDeviceContext();
+    paddle::platform::CPUDeviceContext dev_ctx;
     if (merge_add) {
       paddle::operators::math::scatter::MergeAdd<
          paddle::platform::CPUDeviceContext, T>
paddle/fluid/framework/data_type_transform_test.cu
@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceContext context(gpu_place);
+  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(gpu_place, context.stream())
+                           .get());
+  context.PartialInitWithAllocator();
   auto kernel_fp16 = paddle::framework::OpKernelType(
       paddle::framework::proto::VarType::FP16, gpu_place,
       paddle::framework::DataLayout::kAnyLayout,
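This allocator wiring recurs in most of the .cu test updates in this commit: after constructing a CUDADeviceContext, the test now fetches an allocator from AllocatorFacade for the place and stream, injects it, and finishes initialization with PartialInitWithAllocator(). A self-contained sketch of that call sequence, with all Paddle types replaced by stubs (only the order of calls mirrors the diff):

#include <memory>

// Stubs standing in for the real Paddle types; illustrative only.
struct Allocator {};
struct AllocatorFacade {
  static AllocatorFacade& Instance() { static AllocatorFacade f; return f; }
  std::shared_ptr<Allocator> GetAllocator(int /*place*/, void* /*stream*/) {
    return std::make_shared<Allocator>();
  }
};
struct CUDADeviceContext {
  explicit CUDADeviceContext(int place) : place(place) {}
  void* stream() const { return nullptr; }
  void SetAllocator(Allocator* a) { allocator = a; }
  void PartialInitWithAllocator() { initialized = (allocator != nullptr); }
  int place;
  Allocator* allocator = nullptr;
  bool initialized = false;
};

int main() {
  int gpu_place = 0;
  CUDADeviceContext context(gpu_place);
  // The three steps added throughout the tests, in the same order.
  auto alloc = AllocatorFacade::Instance().GetAllocator(gpu_place, context.stream());
  context.SetAllocator(alloc.get());
  context.PartialInitWithAllocator();
  return context.initialized ? 0 : 1;
}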
paddle/fluid/framework/parallel_executor.cc
@@ -1361,7 +1361,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
       auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
           pool.Get(member_->places_[dev_id]));
       auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
-      dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+      dev_ctx->SetBkclContext(bkcl_ctx.comm());
     }
 #else
     PADDLE_THROW(
paddle/fluid/framework/pten_utils.h
@@ -77,6 +77,13 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
   using TYPE = pten::CPUContext;
 };
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <>
+struct ConvertToPtenContext<platform::CUDADeviceContext> {
+  using TYPE = pten::GPUContext;
+};
+#endif
+
 #ifdef PADDLE_WITH_XPU
 template <>
 struct ConvertToPtenContext<platform::XPUDeviceContext> {
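The specialization above maps the fluid CUDA device context onto the new pten::GPUContext, mirroring the existing CPU mapping. A minimal, self-contained sketch of the trait pattern with stub types (the real Paddle types are not reproduced here):

#include <type_traits>

// Stubs for the fluid and pten context types involved in the mapping.
struct CPUDeviceContext {};
struct CUDADeviceContext {};
struct CPUContext {};
struct GPUContext {};

// Primary template: identity mapping by default.
template <typename DeviceContext>
struct ConvertToPtenContext {
  using TYPE = DeviceContext;
};

template <>
struct ConvertToPtenContext<CPUDeviceContext> {
  using TYPE = CPUContext;
};

// The specialization this commit adds (guarded by CUDA/HIP in the real code).
template <>
struct ConvertToPtenContext<CUDADeviceContext> {
  using TYPE = GPUContext;
};

int main() {
  static_assert(std::is_same<ConvertToPtenContext<CUDADeviceContext>::TYPE,
                             GPUContext>::value,
                "CUDADeviceContext should map to the pten GPU context");
  return 0;
}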
paddle/fluid/framework/tensor_util.cc
@@ -1085,7 +1085,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     is.seekg(seekg, is.cur);
     void* buf;
-    auto ctx = platform::CPUDeviceContext();
+    platform::CPUDeviceContext ctx;
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1155,7 +1155,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
     tensor->Resize(framework::make_ddim(dims));
     void* buf;
-    auto ctx = platform::CPUDeviceContext();
+    platform::CPUDeviceContext ctx;
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace()) ||
@@ -1432,4 +1432,4 @@ std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
   VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
   return os;
 }
 }  // namespace pten
paddle/fluid/framework/tensor_util_test.cc
@@ -73,6 +73,10 @@ TEST(TensorCopy, Tensor) {
     // CPU Tensor to GPU Tensor
     auto gpu_place = new platform::CUDAPlace(0);
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(*gpu_place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
     TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
     // GPU Tensor to CPU Tensor
@@ -166,6 +170,10 @@ TEST(TensorFromVector, Tensor) {
     gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
     auto gpu_place = new paddle::platform::CUDAPlace();
     paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(*gpu_place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
     paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
     paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
@@ -230,6 +238,10 @@ TEST(TensorToVector, Tensor) {
     paddle::framework::Tensor gpu_tensor;
     paddle::platform::CUDAPlace place;
     paddle::platform::CUDADeviceContext gpu_ctx(place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
     paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     std::vector<int> dst;
@@ -267,6 +279,10 @@ TEST(TensorToVector, Tensor_bool) {
     paddle::framework::Tensor gpu_tensor;
     paddle::platform::CUDAPlace place;
     paddle::platform::CUDADeviceContext gpu_ctx(place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
     paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
     std::vector<bool> dst;
@@ -493,6 +509,10 @@ TEST(Tensor, FromAndToStream) {
     auto gpu_place = new platform::CUDAPlace();
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(*gpu_place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
     TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
paddle/fluid/imperative/gloo_context.cc
@@ -46,6 +46,17 @@ void GLOOParallelContext::Init() {
   gloo_wrapper->Init();
   device_ = std::unique_ptr<platform::CPUDeviceContext>(
       new platform::CPUDeviceContext(platform::CPUPlace()));
+  device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(platform::CPUPlace())
+                            .get());
+  device_->SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+  device_->SetZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(platform::CPUPlace())
+          .get());
 }
 
 void GLOOParallelContext::InitWithRingID(int ring_id) {
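Besides the regular device allocator, some contexts now also carry a host allocator and a zero-size allocator, as wired up above for the GLOO CPU context (and below for several GPU test contexts). A self-contained sketch of that three-slot wiring with stub allocator types; the GetZeroAllocator semantics are assumed, not reproduced from the real facade:

#include <memory>

struct Allocator {};  // stub

// Stub facade exposing the allocator lookups used in the diff.
struct AllocatorFacade {
  static AllocatorFacade& Instance() { static AllocatorFacade f; return f; }
  std::shared_ptr<Allocator> GetAllocator(int /*place*/) {
    return std::make_shared<Allocator>();
  }
  std::shared_ptr<Allocator> GetZeroAllocator(int /*place*/) {
    return std::make_shared<Allocator>();
  }
};

// Stub context with the three setter slots used by the commit.
struct CPUDeviceContext {
  void SetAllocator(Allocator* a) { device = a; }
  void SetHostAllocator(Allocator* a) { host = a; }
  void SetZeroAllocator(Allocator* a) { zero = a; }
  Allocator* device = nullptr;
  Allocator* host = nullptr;
  Allocator* zero = nullptr;
};

int main() {
  const int cpu_place = 0;
  auto& facade = AllocatorFacade::Instance();
  auto dev_alloc = facade.GetAllocator(cpu_place);
  auto zero_alloc = facade.GetZeroAllocator(cpu_place);

  CPUDeviceContext device;
  device.SetAllocator(dev_alloc.get());       // regular allocations
  device.SetHostAllocator(dev_alloc.get());   // host-side allocations
  device.SetZeroAllocator(zero_alloc.get());  // zero-sized allocations
  return (device.device && device.host && device.zero) ? 0 : 1;
}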
paddle/fluid/inference/lite/test_engine_lite.cc
@@ -77,6 +77,10 @@ void make_fake_model(std::string* model, std::string* param) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(place, ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
 #else
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,6 +27,18 @@ class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
     ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
+                           .get());
+    ctx_->SetHostAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CPUPlace())
+            .get());
+    ctx_->SetZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(platform::CUDAPlace(0))
+            .get());
+    ctx_->PartialInitWithAllocator();
 
     engine_ = new TensorRTEngine(10, 1 << 10);
     engine_->InitNetwork();
paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -18,6 +18,7 @@
 #include <vector>
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
@@ -44,6 +45,10 @@ TEST(BestFitAllocator, concurrent_cuda) {
   platform::CUDAPlace gpu(0);
   platform::CUDADeviceContext dev_ctx(gpu);
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(gpu, dev_ctx.stream())
+                           .get());
+  dev_ctx.PartialInitWithAllocator();
 
   auto th_main = [&](std::random_device::result_type seed) {
     std::default_random_engine engine(seed);
paddle/fluid/memory/malloc_test.cu
@@ -25,6 +25,7 @@
 #include <vector>
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -105,8 +106,21 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
   main_stream_alloc_ptr.reset();
 
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
+    auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place));
+    ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                          .GetAllocator(place, ctx->stream())
+                          .get());
+    ctx->SetHostAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CPUPlace())
+            .get());
+    ctx->SetZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(place)
+            .get());
+    ctx->PartialInitWithAllocator();
+    dev_ctx.emplace_back(std::move(ctx));
     MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
   }
@@ -144,8 +158,21 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
   main_stream_alloc_ptr.reset();
 
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
-        new platform::CUDADeviceContext(place)));
+    auto ctx = std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place));
+    ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                          .GetAllocator(place, ctx->stream())
+                          .get());
+    ctx->SetHostAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CPUPlace())
+            .get());
+    ctx->SetZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(place)
+            .get());
+    ctx->PartialInitWithAllocator();
+    dev_ctx.emplace_back(std::move(ctx));
     threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
                                   std::cref(*dev_ctx[i])));
   }
paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -110,7 +110,7 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
     return block_size;
   };
 
-  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t height = pre * post;
   int64_t width = n;
   int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
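Several operator files in this commit migrate from the dim3-style accessors (.x/.y/.z) on GetCUDAMaxGridDimSize() to array-style indexing ([0]/[1]/[2]), which suggests the method now returns an array-like type. A self-contained sketch of the before/after access pattern; the real return type is assumed here to behave like std::array:

#include <array>
#include <cassert>

// Old style: a dim3-like struct with named members.
struct OldMaxGridDim {
  unsigned int x, y, z;
};

// New style assumed here: an array-like object indexed by axis.
using NewMaxGridDim = std::array<unsigned int, 3>;

int main() {
  OldMaxGridDim old_dim{2147483647u, 65535u, 65535u};
  NewMaxGridDim new_dim{old_dim.x, old_dim.y, old_dim.z};

  // ctx.GetCUDAMaxGridDimSize().x  becomes  ctx.GetCUDAMaxGridDimSize()[0]
  assert(new_dim[0] == old_dim.x);
  assert(new_dim[1] == old_dim.y);
  assert(new_dim[2] == old_dim.z);
  return 0;
}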
paddle/fluid/operators/argsort_op.cu
@@ -131,7 +131,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
   int block_size = ComputeBlockSize(num_cols);
 
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
   // actually, int num_rows < max_grid_size
   int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
   // Init a index array
@@ -212,7 +212,7 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
   int block_size = ComputeBlockSize(num_cols);
 
-  int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
   // actually, int num_rows < max_grid_size
   int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
   FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
paddle/fluid/operators/broadcast_tensors_op.cu
@@ -90,8 +90,8 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
         // reduce_sum implementation on CUDA
         auto stream = context.cuda_device_context().stream();
         TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            *input_tensor, output_tensor, kps::IdentityFunctor<T>(),
-            reduce_dims_vec, stream);
+            context.cuda_device_context(), *input_tensor, output_tensor,
+            kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
       }
     }
   }
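The reduce-call updates repeated across the operator files above and below add the device context as the leading argument of TensorReduceFunctorImpl while keeping the trailing stream argument. A self-contained sketch of the old and new call shapes with all types stubbed out; only the parameter order mirrors the diff:

#include <initializer_list>

// Stubs; only the argument order of the two signatures matters here.
struct Stream {};
struct GPUContext {
  Stream stream() const { return Stream{}; }
};
struct Tensor {};
template <typename T> struct IdentityFunctor {};
struct AddFunctor {};

// Old call shape (before this commit): tensors first, raw stream last.
template <typename Tx, typename Ty, typename Reducer, typename Transform>
void TensorReduceFunctorImplOld(const Tensor&, Tensor*, Transform,
                                std::initializer_list<int>, Stream) {}

// New call shape (after this commit): device context leads the argument list.
template <typename Tx, typename Ty, typename Reducer, typename Transform>
void TensorReduceFunctorImplNew(const GPUContext&, const Tensor&, Tensor*,
                                Transform, std::initializer_list<int>, Stream) {}

int main() {
  GPUContext dev_ctx;
  Tensor in, out;
  TensorReduceFunctorImplOld<float, float, AddFunctor, IdentityFunctor<float>>(
      in, &out, IdentityFunctor<float>{}, {1}, dev_ctx.stream());
  TensorReduceFunctorImplNew<float, float, AddFunctor, IdentityFunctor<float>>(
      dev_ctx, in, &out, IdentityFunctor<float>{}, {1}, dev_ctx.stream());
  return 0;
}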
paddle/fluid/operators/cholesky_solve_op.cu
@@ -115,7 +115,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
     TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
+        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
+        out_reduce_dims, stream);
   }
 };
paddle/fluid/operators/clip_by_norm_op.cu
@@ -77,7 +77,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
           {1}, dev_ctx);
       TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
                               kps::SquareFunctor<platform::float16, float>>(
-          *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
+          dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
           reduce_dims, dev_ctx.stream());
       auto tmp_eigen = EigenVector<float>::Flatten(tmp);
       auto x_norm = tmp_eigen.sqrt();
paddle/fluid/operators/controlflow/compare_all_op.cu
@@ -65,7 +65,8 @@ class CompareReduceOpKernel
       auto stream = context.cuda_device_context().stream();
       TensorReduceFunctorImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
-          tmp, z, kps::IdentityFunctor<bool>(), reduce_dims, stream);
+          context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
+          reduce_dims, stream);
     }
   }
 };
paddle/fluid/operators/copy_cross_scope_test.cc
@@ -131,12 +131,20 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
 TEST(copy_cross_scope, CUDA_fp32) {
   f::Scope scope;
   p::CUDADeviceContext ctx(p::CUDAPlace(0));
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(p::CUDAPlace(0), ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
   Compare1<float>(&scope, ctx, "copy_cross_scope");
 }
 
 TEST(copy_cross_scope_to_main_scope, CUDA_fp32) {
   f::Scope scope;
   p::CUDADeviceContext ctx(p::CUDAPlace(0));
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(p::CUDAPlace(0), ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
   Compare2<float>(&scope, ctx, "copy_cross_scope");
 }
 #elif PADDLE_WITH_ASCEND_CL
paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
@@ -51,8 +52,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyRawKernel<T>(static_cast<const pten::GPUContext&>(cuda_ctx),
+                                 *pt_x.get(), *pt_y.get(), axis, pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
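The call above now hands the CUDA device context to the pten kernel as a pten::GPUContext reference. The static_cast in the diff suggests that platform::CUDADeviceContext is convertible to (presumably derives from) pten::GPUContext after this commit; the sketch below assumes a simple inheritance relationship and stubs out the tensor and kernel types:

// Stubs; the inheritance below is an assumption made for illustration.
struct GPUContext {};                      // stands in for pten::GPUContext
struct CUDADeviceContext : GPUContext {};  // stands in for platform::CUDADeviceContext
struct DenseTensor {};

// Stub with the call shape of pten::MultiplyRawKernel after the change.
template <typename T>
void MultiplyRawKernel(const GPUContext& /*dev_ctx*/, const DenseTensor& /*x*/,
                       const DenseTensor& /*y*/, int /*axis*/,
                       DenseTensor* /*out*/) {}

int main() {
  CUDADeviceContext cuda_ctx;
  DenseTensor x, y, z;
  int axis = -1;
  // Mirrors the updated call site: cast the fluid context to the pten context.
  MultiplyRawKernel<float>(static_cast<const GPUContext&>(cuda_ctx), x, y, axis,
                           &z);
  return 0;
}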
paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -1189,7 +1189,8 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                    framework::Tensor *src, framework::Tensor *dst) {
   std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
   TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-      *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());
+      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
+      dev_ctx.stream());
 }
 
 template <ElementwiseType ET, typename T, typename Functor>
paddle/fluid/operators/feed_forward_test.cu
@@ -275,6 +275,18 @@ class TestFeedForward {
     output_size_ = 3 * num_head_ * dim_head_;
     input_size_ = dim_embed_;
     ctx_ = new platform::CUDADeviceContext(place_);
+    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(place_, ctx_->stream())
+                           .get());
+    ctx_->SetHostAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CPUPlace())
+            .get());
+    ctx_->SetZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(place_)
+            .get());
+    ctx_->PartialInitWithAllocator();
 
     size_src_ = bsz_seq_ * dim_embed_;         // src: [bs, seq_len, em_dim]
     size_weight_ = dim_embed_ * output_size_;  // weight: [output_size, em_dim]
paddle/fluid/operators/fused/attn_gemm.h
@@ -166,7 +166,8 @@ class AttnMatMul {
     if (support_case_1 || support_case_2) {
       gpuStream_t stream = dev_ctx_.stream();
       TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1}, stream);
+          dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
+          stream);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Only support reduce when the input dims are [0,1,2,3,4] and "
paddle/fluid/operators/gelu_op.cu
@@ -108,7 +108,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
       is_aligned(y, kAlignment)) {                                          \
     size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
     size_t block = (n / __vec_size + thread - 1) / thread;                  \
-    block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+    block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
     VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
              << " , thread = " << thread;                                   \
     FP16FastGeluFwdCUDAKernel<                                              \
@@ -144,7 +144,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
       is_aligned(x_g, kAlignment)) {                                        \
     size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
     size_t block = (n / __vec_size + thread - 1) / thread;                  \
-    block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x);     \
+    block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
     VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
              << " , thread = " << thread;                                   \
     FP16FastGeluBwdCUDAKernel<                                              \
paddle/fluid/operators/graph_khop_sampler_op.cu
@@ -260,7 +260,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
   int block = 1024;
 #endif
   const auto& dev_ctx = ctx.cuda_device_context();
-  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_input + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   // 1. Insert data into keys and values.
@@ -334,7 +334,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
   int block = 1024;
 #endif
   const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (outputs->size() + block - 1) / block;
   int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   ReindexSrcOutput<
paddle/fluid/operators/graph_send_recv_op.cu
@@ -197,7 +197,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(
 #endif
   int64_t n = slice_size * index_size;
   const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
   int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   int64_t input_size = src_dims[0];
@@ -320,7 +320,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
 #endif
   int64_t n = slice_size * index_size;
   const auto& dev_ctx = ctx.cuda_device_context();
-  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (n + block - 1) / block;
   int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   int64_t input_size = src_dims[0];
paddle/fluid/operators/gumbel_softmax_op.cu
@@ -92,7 +92,7 @@ struct OneHotGenerator<platform::CUDADeviceContext, T> {
   const int size_from_axis = SizeFromAxis(axis, X.dims());
   const int size_out_axis = SizeOutAxis(axis, X.dims());
   constexpr int thread_size = 512;
-  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
+  int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0];
   int64_t height = size_to_axis * size_out_axis;
   int block_size = height < max_grid_dimx ? height : max_grid_dimx;
paddle/fluid/operators/index_sample_op.cu
@@ -27,10 +27,10 @@ namespace operators {
 namespace {
 void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
-  dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
+  auto max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
                           .GetCUDAMaxGridDimSize();
-  grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
-  grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
+  grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
+  grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
 }
 }
paddle/fluid/operators/interpolate_v2_op.cu
@@ -45,11 +45,11 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
   int block_y = std::min(GetLastPow2(height), max_threads / block_x);
   int block_z = std::min(num_img, max_threads / block_x / block_y);
 
-  dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
-  int grid_x = std::min<int>(max_grid_dim.x, platform::DivUp(width, block_x));
-  int grid_y = std::min<int>(max_grid_dim.y, platform::DivUp(height, block_y));
+  auto max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min<int>(max_grid_dim[0], platform::DivUp(width, block_x));
+  int grid_y = std::min<int>(max_grid_dim[1], platform::DivUp(height, block_y));
   int grid_z =
-      std::min<int>(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
+      std::min<int>(max_grid_dim[2], platform::DivUp(num_img, block_z * 4));
 
   const int capability = context.GetComputeCapability();
   platform::GpuLaunchConfig config;
paddle/fluid/operators/kron_op.h
@@ -306,11 +306,11 @@ struct KronGradOpFunctor {
     auto stream = dev_ctx.stream();  // it is a cuda device_context
     if (dx) {
       TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
+          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
     }
     if (dy) {
       TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
+          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
     }
 #else
     auto* place = dev_ctx.eigen_device();
paddle/fluid/operators/kthvalue_op.cu
@@ -54,7 +54,7 @@ bool SortKthvalue(const platform::CUDADeviceContext& ctx,
   input_indices.mutable_data<int64_t>(ctx.GetPlace());
   size_t temp_storage_bytes = -1;
   int block_size = getBlockSize(num_cols);
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
+  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
   unsigned int grid_size =
       num_rows < maxGridDimX ? static_cast<unsigned int>(num_rows) : maxGridDimX;
paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -72,6 +72,10 @@ TEST(LiteEngineOp, engine_op) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(place, ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
 #else
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -299,7 +299,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* logits_max_buff = logits_max.mutable_data<T>(place);
     TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
-        softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
+        dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
         dev_ctx.stream());
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -321,7 +321,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
     TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
-        softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
+        dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
        dev_ctx.stream());
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
paddle/fluid/operators/math/beam_search_test.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/beam_search.h"
 #include <gtest/gtest.h>
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
 
 void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                        paddle::framework::LoDTensor* scores,
@@ -129,6 +131,83 @@ void TestBeamSearch() {
   delete context;
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <>
+void TestBeamSearch<paddle::platform::CUDADeviceContext,
+                    paddle::platform::CUDAPlace>() {
+  paddle::framework::LoDTensor ids;
+  paddle::framework::LoDTensor scores;
+  paddle::framework::LoDTensor pre_ids;
+  paddle::framework::LoDTensor pre_scores;
+
+  auto* place = new paddle::platform::CUDAPlace();
+  auto* context = new paddle::platform::CUDADeviceContext(*place);
+  context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(*place, context->stream())
+                            .get());
+  context->PartialInitWithAllocator();
+  if (paddle::platform::is_cpu_place(*place)) {
+    PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
+  } else {
+    paddle::framework::LoDTensor cpu_ids;
+    paddle::framework::LoDTensor cpu_scores;
+    paddle::framework::LoDTensor cpu_pre_ids;
+    paddle::framework::LoDTensor cpu_pre_scores;
+    PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);
+
+    paddle::framework::TensorCopySync(cpu_ids, *place, &ids);
+    paddle::framework::TensorCopySync(cpu_scores, *place, &scores);
+    paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids);
+    paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores);
+
+    ids.set_lod(cpu_ids.lod());
+    scores.set_lod(cpu_scores.lod());
+    pre_ids.set_lod(cpu_pre_ids.lod());
+    pre_scores.set_lod(cpu_pre_scores.lod());
+  }
+
+  paddle::framework::LoDTensor selected_ids;
+  paddle::framework::LoDTensor selected_scores;
+  paddle::framework::LoDTensor parent_idx;
+  size_t level = 0;
+  size_t beam_size = 2;
+  int end_id = 0;
+  paddle::operators::math::BeamSearchFunctor<
+      paddle::platform::CUDADeviceContext, float>
+      beamsearch;
+  beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
+             &selected_scores, &parent_idx, level, beam_size, end_id, true);
+
+  ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
+
+  paddle::framework::LoDTensor cpu_selected_ids;
+  paddle::framework::LoDTensor cpu_selected_scores;
+  if (paddle::platform::is_cpu_place(*place)) {
+    cpu_selected_ids = selected_ids;
+    cpu_selected_scores = selected_scores;
+  } else {
+    paddle::framework::TensorCopySync(selected_ids,
+                                      paddle::platform::CPUPlace(),
+                                      &cpu_selected_ids);
+    paddle::framework::TensorCopySync(selected_scores,
+                                      paddle::platform::CPUPlace(),
+                                      &cpu_selected_scores);
+    cpu_selected_ids.set_lod(selected_ids.lod());
+    cpu_selected_scores.set_lod(selected_scores.lod());
+  }
+
+  std::vector<int64_t> expected_ids({4, 5, 3, 8});
+  std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
+  for (int i = 0; i < 4; i++) {
+    ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
+    ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
+  }
+
+  delete place;
+  delete context;
+}
+#endif
+
 TEST(BeamSearch, CPU) {
   TestBeamSearch<paddle::platform::CPUDeviceContext,
                  paddle::platform::CPUPlace>();
paddle/fluid/operators/math/blas_impl.cu.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
 
 DECLARE_bool(enable_cublas_tensor_op_math);
@@ -92,6 +93,32 @@ struct CUBlas<float> {
 #endif
   }
 
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
+                      cublasOperation_t transb, int m, int n, int k,
+                      const float *alpha, const void *A, cudaDataType_t Atype,
+                      int lda, const void *B, cudaDataType_t Btype, int ldb,
+                      const float *beta, void *C, cudaDataType_t Ctype,
+                      int ldc) {
+    // Because the gcc 4.8 doesn't expand template parameter pack that
+    // appears in a lambda-expression, I can not use template parameter pack
+    // here.
+#if CUDA_VERSION >= 8000
+    VLOG(5) << "use_tensor_op_math: "
+            << (dev_ctx->tensor_core_available() ? "True" : "False");
+    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx(
+          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
+          beta, C, Ctype, ldc));
+    });
+#else
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasSgemmEx is not supported on cuda <= 7.5"));
+#endif
+  }
+
   template <typename... ARGS>
   static void TRSM(ARGS... args) {
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...));
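The new CUBlas<...>::GEMM_EX overloads above and below take a pten::GPUContext* and route the cuBLAS call through the context: the context reports whether tensor-core math is available and runs the call inside a callback that receives the cuBLAS handle. A self-contained sketch of just that control flow, with cuBLAS and the real context replaced by stubs:

#include <functional>
#include <iostream>

using cublasHandle_t = void*;  // stub handle type

struct GPUContext {  // stub standing in for pten::GPUContext
  bool tensor_core_available() const { return true; }
  void TensorCoreCublasCallIfAvailable(
      const std::function<void(cublasHandle_t)>& fn) const {
    fn(nullptr);  // the real context would pass a tensor-op-enabled handle
  }
};

void GemmEx(GPUContext* dev_ctx) {
  std::cout << "use_tensor_op_math: "
            << (dev_ctx->tensor_core_available() ? "True" : "False") << "\n";
  dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t /*handle*/) {
    // cublasSgemmEx / cublasGemmEx would be invoked here with the handle.
  });
}

int main() {
  GPUContext ctx;
  GemmEx(&ctx);
  return 0;
}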
@@ -273,6 +300,37 @@ struct CUBlas<platform::float16> {
         "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
+                      cublasOperation_t transb, int m, int n, int k,
+                      const void *alpha, const void *A, cudaDataType_t Atype,
+                      int lda, const void *B, cudaDataType_t Btype, int ldb,
+                      const void *beta, void *C, cudaDataType_t Ctype, int ldc,
+                      cudaDataType_t computeType) {
+#if CUDA_VERSION >= 8000
+    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+    bool use_tensor_op_math = dev_ctx->tensor_core_available();
+    if (use_tensor_op_math) {
+      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+    }
+    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
+#endif  // CUDA_VERSION >= 9000
+    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
+          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
+          beta, C, Ctype, ldc, computeType, algo));
+    });
+#else
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
+#endif
+  }
 };
 
 template <>
@@ -388,6 +446,37 @@ struct CUBlas<platform::complex<float>> {
 #endif
   }
 
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
+                      cublasOperation_t transb, int m, int n, int k,
+                      const void *alpha, const void *A, cudaDataType_t Atype,
+                      int lda, const void *B, cudaDataType_t Btype, int ldb,
+                      const void *beta, void *C, cudaDataType_t Ctype, int ldc,
+                      cudaDataType_t computeType) {
+#if CUDA_VERSION >= 8000
+    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+    bool use_tensor_op_math = dev_ctx->tensor_core_available();
+    if (use_tensor_op_math) {
+      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+    }
+    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
+#endif  // CUDA_VERSION >= 9000
+    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
+          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
+          beta, C, Ctype, ldc, computeType, algo));
+    });
+#else
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
+#endif
+  }
+
   static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
                          cublasFillMode_t uplo, cublasOperation_t transa,
                          cublasDiagType_t diag, int m, int n,
@@ -529,6 +618,37 @@ struct CUBlas<platform::complex<double>> {
         "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 
+  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
+  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
+  template <typename... ARGS>
+  static void GEMM_EX(pten::GPUContext *dev_ctx, cublasOperation_t transa,
+                      cublasOperation_t transb, int m, int n, int k,
+                      const void *alpha, const void *A, cudaDataType_t Atype,
+                      int lda, const void *B, cudaDataType_t Btype, int ldb,
+                      const void *beta, void *C, cudaDataType_t Ctype, int ldc,
+                      cudaDataType_t computeType) {
+#if CUDA_VERSION >= 8000
+    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+    bool use_tensor_op_math = dev_ctx->tensor_core_available();
+    if (use_tensor_op_math) {
+      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+    }
+    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
+#endif  // CUDA_VERSION >= 9000
+    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx(
+          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
+          beta, C, Ctype, ldc, computeType, algo));
+    });
+#else
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
+#endif
+  }
 };
 
 template <>
@@ -564,6 +684,39 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 8000
 }
 
+template <>
+template <typename T>
+void Blas<pten::GPUContext>::GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
+                                  int M, int N, int K, T alpha, const T *A,
+                                  const T *B, T beta, T *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+#if CUDA_VERSION >= 8000
+  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
+    auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
+    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
+                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
+                       CUDA_R_32F, N);
+  } else {
+#endif  // CUDA_VERSION >= 8000
+
+    context_.CublasCall([&](cublasHandle_t handle) {
+      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+                      lda, &beta, C, N);
+    });
+
+#if CUDA_VERSION >= 8000
+  }
+#endif  // CUDA_VERSION >= 8000
+}
+
 template <>
 template <>
 inline void Blas<platform::CUDADeviceContext>::GEMM(
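The new Blas<pten::GPUContext>::GEMM specializations above and below choose between the tensor-core GEMM_EX path and the plain cuBLAS call from a runtime flag combined with a compile-time type check (std::is_same<T, float>). A self-contained sketch of that dispatch shape with the cuBLAS calls stubbed out; FLAGS_enable_cublas_tensor_op_math is modeled as a plain bool here:

#include <iostream>
#include <type_traits>

// Stub standing in for the FLAGS_enable_cublas_tensor_op_math gflag.
static bool FLAGS_enable_cublas_tensor_op_math = true;

template <typename T>
void GemmEx(const T* /*A*/, const T* /*B*/, T* /*C*/) {
  std::cout << "tensor-core GEMM_EX path\n";  // cublasGemmEx would run here
}

template <typename T>
void PlainGemm(const T* /*A*/, const T* /*B*/, T* /*C*/) {
  std::cout << "plain cuBLAS GEMM path\n";  // cublasSgemm/Dgemm would run here
}

template <typename T>
void Gemm(const T* A, const T* B, T* C) {
  // Same shape as the diff: flag AND T == float selects the GEMM_EX path.
  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
    GemmEx(A, B, C);
  } else {
    PlainGemm(A, B, C);
  }
}

int main() {
  float af[4] = {}, bf[4] = {}, cf[4] = {};
  double ad[4] = {}, bd[4] = {}, cd[4] = {};
  Gemm(af, bf, cf);  // dispatches to the tensor-core path
  Gemm(ad, bd, cd);  // double is not dispatched to GEMM_EX
  return 0;
}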
@@ -611,6 +764,55 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
 #endif  // CUDA_VERSION >= 8000
 }
 
+template <>
+template <>
+inline void Blas<pten::GPUContext>::GEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    platform::float16 alpha, const platform::float16 *A,
+    const platform::float16 *B, platform::float16 beta,
+    platform::float16 *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(
+      context_.GetComputeCapability(), 53,
+      platform::errors::InvalidArgument(
+          "cublas fp16 gemm requires GPU compute capability >= 53,"
+          "but received %d",
+          context_.GetComputeCapability()));
+
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);
+
+#if CUDA_VERSION >= 8000
+  // cublasHgemm does true FP16 computation which is slow for non-Volta
+  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
+  // input/output in fp16, computation in fp32, which can also be accelerated
+  // using tensor cores in volta GPUs.
+  auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
+  CUBlas<platform::float16>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K,
+                                     &h_alpha, B, CUDA_R_16F, ldb, A,
+                                     CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F,
+                                     N, CUDA_R_32F);
+#else
+  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
+  context_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
+                                    &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
+                                    N);
+  });
+#endif  // CUDA_VERSION >= 8000
+}
+
 template <>
 template <>
 inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
complex
<
float
>
alpha
,
const
platform
::
complex
<
float
>
*
A
,
const
platform
::
complex
<
float
>
*
B
,
platform
::
complex
<
float
>
beta
,
platform
::
complex
<
float
>
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
platform
::
errors
::
InvalidArgument
(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
thrust
::
complex
<
float
>
c_alpha
=
thrust
::
complex
<
float
>
(
alpha
.
real
,
alpha
.
imag
);
thrust
::
complex
<
float
>
c_beta
=
thrust
::
complex
<
float
>
(
beta
.
real
,
beta
.
imag
);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto
&
cuda_ctx
=
const_cast
<
pten
::
GPUContext
&>
(
context_
);
CUBlas
<
platform
::
complex
<
float
>>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
c_alpha
,
B
,
CUDA_C_32F
,
ldb
,
A
,
CUDA_C_32F
,
lda
,
&
c_beta
,
C
,
CUDA_C_32F
,
N
,
CUDA_C_32F
);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
platform
::
complex
<
float
>>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
c_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
c_beta
,
h_C
,
N
);
});
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMM
(
...
...
@@ -708,6 +960,57 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
#endif // CUDA_VERSION >= 8000
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    platform::complex<double> alpha, const platform::complex<double> *A,
    const platform::complex<double> *B, platform::complex<double> beta,
    platform::complex<double> *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(), 53,
      platform::errors::InvalidArgument(
          "cublas complex128 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  thrust::complex<double> c_alpha =
      thrust::complex<double>(alpha.real, alpha.imag);
  thrust::complex<double> c_beta =
      thrust::complex<double>(beta.real, beta.imag);

#if CUDA_VERSION >= 8000
  // cublasHgemm does true FP16 computation which is slow for non-Volta
  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
  // input/output in fp16, computation in fp32, which can also be accelerated
  // using tensor cores in volta GPUs.
  auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
  CUBlas<platform::complex<double>>::GEMM_EX(
      &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A,
      CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
#else
  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<platform::complex<double>>::GEMM(handle, cuTransB, cuTransA, N, M,
                                            K, &c_alpha, h_B, ldb, h_A, lda,
                                            &c_beta, h_C, N);
  });
#endif  // CUDA_VERSION >= 8000
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
...
...
@@ -738,6 +1041,35 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
#endif  // CUDA_VERSION >= 8000
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMM(bool transA, bool transB, int M, int N,
                                  int K, T alpha, const T *A, int lda,
                                  const T *B, int ldb, T beta, T *C,
                                  int ldc) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;

#if CUDA_VERSION >= 8000
  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
    auto &cuda_ctx = const_cast<pten::GPUContext &>(context_);
    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
                       CUDA_R_32F, ldc);
  } else {
#endif  // CUDA_VERSION >= 8000
    context_.CublasCall([&](cublasHandle_t handle) {
      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
                      lda, &beta, C, ldc);
    });
#if CUDA_VERSION >= 8000
  }
#endif  // CUDA_VERSION >= 8000
}

template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
...
...
@@ -755,6 +1087,25 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
  });
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMM(
    bool transA, bool transB, int M, int N, int K, platform::float16 alpha,
    const platform::float16 *A, int lda, const platform::float16 *B, int ldb,
    platform::float16 beta, platform::float16 *C, int ldc) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
                                    &alpha, B, ldb, A, lda, &beta, C, ldc);
  });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
...
...
@@ -764,6 +1115,14 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
  });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
...
...
@@ -771,6 +1130,13 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
      [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::SCAL(int n, const T alpha, T *x) const {
  context_.CublasCall(
      [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
...
...
@@ -778,6 +1144,13 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
      [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::VCOPY(int n, const T *x, T *y) const {
  context_.CublasCall(
      [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
...
...
@@ -790,6 +1163,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
                                  const T *A, const T *B, T beta,
                                  T *C) const {
  cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
  });
}

template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
...
...
@@ -806,6 +1190,24 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
  }
}
template <>
template <>
inline void Blas<pten::GPUContext>::GEMV(bool trans_a, int M, int N,
                                         platform::float16 alpha,
                                         const platform::float16 *A,
                                         const platform::float16 *B,
                                         platform::float16 beta,
                                         platform::float16 *C) const {
  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
  if (trans_a) {
    this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
                                           alpha, B, A, beta, C);
  } else {
    this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
                                           alpha, A, B, beta, C);
  }
}
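Since cuBLAS has no half-precision gemv, the specialization above rewrites y = op(A) x as a GEMM with a unit dimension: M x 1 for the non-transposed case and 1 x N for the transposed one. A small usage sketch, under the same GetBlas assumption as before (illustrative only):

// Hypothetical sketch: fp16 matrix-vector product through the wrapper.
// A is M x N row-major, x has length N, y has length M.
void GemvFp16(const pten::GPUContext& dev_ctx, int M, int N,
              const paddle::platform::float16* A,
              const paddle::platform::float16* x,
              paddle::platform::float16* y) {
  using float16 = paddle::platform::float16;
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float16>(dev_ctx);
  // Dispatches to the degenerate GEMM (M x 1 = [M x N] * [N x 1]) shown above.
  blas.GEMV(/*trans_a=*/false, M, N, float16(1.0f), A, x, float16(0.0f), y);
}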
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...
...
@@ -854,6 +1256,56 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#endif  // CUDA_VERSION >= 9010
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                         CBLAS_TRANSPOSE transB, int M, int N,
                                         int K, T alpha, const T *A,
                                         const T *B, T beta, T *C,
                                         int batchCount, int64_t strideA,
                                         int64_t strideB) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const int64_t strideC = M * N;

#if CUDA_VERSION >= 9010
  if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
      std::is_same<T, paddle::platform::float16>::value) {
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
    bool use_tensor_op_math = context_.tensor_core_available();
    if (use_tensor_op_math) {
      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
    }
    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
    auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;

    context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx(
          handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A,
          fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo));
    });
  } else {
#endif  // CUDA_VERSION >= 9010
    context_.CublasCall([&](cublasHandle_t handle) {
      CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K,
                                    &alpha, B, ldb, strideB, A, lda, strideA,
                                    &beta, C, ldc, strideC, batchCount);
    });
#if CUDA_VERSION >= 9010
  }
#endif  // CUDA_VERSION >= 9010
}
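strideC is fixed to M * N, so the batched output is always densely packed. A sketch of multiplying contiguously stored batches A[batch][M][K] and B[batch][K][N] through this path (illustrative only, same GetBlas assumption as above):

// Hypothetical sketch: C[b] = A[b] * B[b] for b in [0, batch).
void BatchedMatMul(const pten::GPUContext& dev_ctx, int batch, int M, int N,
                   int K, const float* A, const float* B, float* C) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  blas.BatchedGEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, A, B, 0.0f, C,
                   batch,
                   /*strideA=*/static_cast<int64_t>(M) * K,
                   /*strideB=*/static_cast<int64_t>(K) * N);
}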
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...
...
@@ -865,6 +1317,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
  }
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                         CBLAS_TRANSPOSE transB, int M, int N,
                                         int K, T alpha, const T **A,
                                         const T **B, T beta, T **C,
                                         int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta,
                           C[k]);
  }
}

template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...
...
@@ -878,6 +1343,19 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
  }
}
template <>
template <>
inline void Blas<pten::GPUContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    platform::float16 alpha, const platform::float16 **A,
    const platform::float16 **B, platform::float16 beta, platform::float16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha,
                                           A[k], B[k], beta, C[k]);
  }
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
...
...
@@ -903,6 +1381,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
                                  CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
                                  int M, int N, T alpha, const T *A, int lda,
                                  T *B, int ldb) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  cublasSideMode_t cuSide =
      (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t cuUplo =
      (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasDiagType_t cuDiag =
      (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A,
                    lda, B, ldb);
  });
}
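The Left/Right and Lower/Upper flips apply the same row-major-as-transpose trick to triangular solves: op(A) X = alpha B in row major is X' op(A') = alpha B' in column major, so the side, the fill mode, and the M/N pair all swap. A hedged usage sketch (same GetBlas assumption; not part of the patch):

// Hypothetical sketch: A is M x M lower-triangular (row-major), B is M x N
// and is overwritten with X where A * X = B. The wrapper above internally
// flips to CUBLAS_SIDE_RIGHT / CUBLAS_FILL_MODE_UPPER and swaps M and N,
// because the row-major buffers are the transposes of what cuBLAS sees.
void LowerTriangularSolve(const pten::GPUContext& dev_ctx, int M, int N,
                          const float* A, int lda, float* B, int ldb) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  blas.TRSM(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, M, N, 1.0f, A,
            lda, B, ldb);
}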
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
...
...
@@ -913,6 +1415,15 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRF(int n, T **a, int *ipiv, int *info,
                                          int batch_size) const {
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
...
...
@@ -931,6 +1442,23 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRI(int n, const T **a, const int *ipiv,
                                          T **a_inv, int *info,
                                          int batch_size) const {
  PADDLE_ENFORCE_NE(
      a_inv, a,
      platform::errors::InvalidArgument(
          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
          "in-place. The memory space of output matrix (address: %p) cannot "
          "overlap memory space of input matrix (address: %p).",
          a_inv, a));
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
...
...
@@ -941,6 +1469,15 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedMatInv(int n, const T **a, T **a_inv,
                                           int *info, int batch_size) const {
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGETRS(
...
...
@@ -955,6 +1492,21 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans, int n,
                                          int nrhs, const T **a, int lda,
                                          int *ipiv, T **b, int ldb, int *info,
                                          int batch_size) const {
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTrans =
      (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb,
                           info, batch_size);
  });
}
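BatchedGETRF and BatchedGETRS are normally paired: getrfBatched overwrites each matrix with its LU factors and pivots, and getrsBatched then solves against those factors. An illustrative sketch (same GetBlas assumption; the pointer arrays and pivot/info buffers must be laid out as the underlying cuBLAS batched routines require):

// Hypothetical sketch: factor each n x n matrix in `a`, then solve for the
// n x nrhs right-hand sides in `b`. `a` and `b` are arrays of device matrix
// pointers, one entry per batch element.
void BatchedLuSolve(const pten::GPUContext& dev_ctx, int n, int nrhs,
                    float** a, int* ipiv, float** b, int* info,
                    int batch_size) {
  auto blas =
      paddle::operators::math::GetBlas<pten::GPUContext, float>(dev_ctx);
  blas.BatchedGETRF(n, a, ipiv, info, batch_size);  // LU factors in-place
  blas.BatchedGETRS(CblasNoTrans, n, nrhs, const_cast<const float**>(a), n,
                    ipiv, b, n, info, batch_size);  // solve with the factors
}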
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedTRSM(
...
...
@@ -979,6 +1531,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
  });
}
template <>
template <typename T>
void Blas<pten::GPUContext>::BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
                                         CBLAS_TRANSPOSE transA,
                                         CBLAS_DIAG diag, int M, int N,
                                         T alpha, const T **A, int lda, T **B,
                                         int ldb, int batch_size) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  cublasSideMode_t cuSide =
      (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t cuUplo =
      (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasDiagType_t cuDiag =
      (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
                          &alpha, A, lda, B, ldb, batch_size);
  });
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/math/blas_impl.hip.h
...
...
@@ -17,6 +17,7 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
DECLARE_bool(enable_cublas_tensor_op_math);
...
...
@@ -221,6 +222,20 @@ struct CUBlas<platform::float16> {
          beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
    });
  }

  template <typename... ARGS>
  static void GEMM_EX(pten::GPUContext *dev_ctx, rocblas_operation transa,
                      rocblas_operation transb, int m, int n, int k,
                      const void *alpha, const void *A, rocblas_datatype Atype,
                      int lda, const void *B, rocblas_datatype Btype, int ldb,
                      const void *beta, void *C, rocblas_datatype Ctype,
                      int ldc, rocblas_datatype computeType) {
    rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
    dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex(
          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype,
          ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
    });
  }
};

template <>
...
...
@@ -305,6 +320,20 @@ struct CUBlas<platform::complex<float>> {
beta
,
C
,
Ctype
,
ldc
,
C
,
Ctype
,
ldc
,
computeType
,
algo
,
0
,
0
));
});
}
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
pten
::
GPUContext
*
dev_ctx
,
rocblas_operation
transa
,
rocblas_operation
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
rocblas_datatype
Atype
,
int
lda
,
const
void
*
B
,
rocblas_datatype
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
rocblas_datatype
Ctype
,
int
ldc
,
rocblas_datatype
computeType
)
{
rocblas_gemm_algo
algo
=
rocblas_gemm_algo_standard
;
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
rocblas_handle
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
rocblas_gemm_ex
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
C
,
Ctype
,
ldc
,
computeType
,
algo
,
0
,
0
));
});
}
};
template
<
>
...
...
@@ -389,6 +418,20 @@ struct CUBlas<platform::complex<double>> {
beta
,
C
,
Ctype
,
ldc
,
C
,
Ctype
,
ldc
,
computeType
,
algo
,
0
,
0
));
});
}
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
pten
::
GPUContext
*
dev_ctx
,
rocblas_operation
transa
,
rocblas_operation
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
rocblas_datatype
Atype
,
int
lda
,
const
void
*
B
,
rocblas_datatype
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
rocblas_datatype
Ctype
,
int
ldc
,
rocblas_datatype
computeType
)
{
rocblas_gemm_algo
algo
=
rocblas_gemm_algo_standard
;
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
rocblas_handle
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
rocblas_gemm_ex
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
C
,
Ctype
,
ldc
,
computeType
,
algo
,
0
,
0
));
});
}
};
template
<
>
...
...
@@ -412,6 +455,27 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
&
beta
,
C
,
N
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_operation
cuTransB
=
(
transB
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
});
}
template
<
>
template
<
>
...
...
@@ -448,6 +512,43 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f16_r
,
ldb
,
A
,
rocblas_datatype_f16_r
,
lda
,
&
h_beta
,
C
,
rocblas_datatype_f16_r
,
N
,
rocblas_datatype_f32_r
);
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
const
platform
::
float16
*
B
,
platform
::
float16
beta
,
platform
::
float16
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_operation
cuTransB
=
(
transB
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
platform
::
errors
::
InvalidArgument
(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
auto
&
cuda_ctx
=
const_cast
<
pten
::
GPUContext
&>
(
context_
);
CUBlas
<
platform
::
float16
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
rocblas_datatype_f16_r
,
ldb
,
A
,
rocblas_datatype_f16_r
,
lda
,
&
h_beta
,
C
,
rocblas_datatype_f16_r
,
N
,
rocblas_datatype_f32_r
);
}
template
<
>
template
<
>
...
...
@@ -485,6 +586,44 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f32_c
,
ldb
,
A
,
rocblas_datatype_f32_c
,
lda
,
&
c_beta
,
C
,
rocblas_datatype_f32_c
,
N
,
rocblas_datatype_f32_c
);
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
complex
<
float
>
alpha
,
const
platform
::
complex
<
float
>
*
A
,
const
platform
::
complex
<
float
>
*
B
,
platform
::
complex
<
float
>
beta
,
platform
::
complex
<
float
>
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_operation
cuTransB
=
(
transB
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
platform
::
errors
::
InvalidArgument
(
"cublas complex64 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
thrust
::
complex
<
float
>
c_alpha
=
thrust
::
complex
<
float
>
(
alpha
.
real
,
alpha
.
imag
);
thrust
::
complex
<
float
>
c_beta
=
thrust
::
complex
<
float
>
(
beta
.
real
,
beta
.
imag
);
auto
&
cuda_ctx
=
const_cast
<
pten
::
GPUContext
&>
(
context_
);
CUBlas
<
platform
::
complex
<
float
>>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
c_alpha
,
B
,
rocblas_datatype_f32_c
,
ldb
,
A
,
rocblas_datatype_f32_c
,
lda
,
&
c_beta
,
C
,
rocblas_datatype_f32_c
,
N
,
rocblas_datatype_f32_c
);
}
template
<
>
template
<
>
...
...
@@ -523,6 +662,45 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
rocblas_datatype_f64_c
,
ldb
,
A
,
rocblas_datatype_f64_c
,
lda
,
&
c_beta
,
C
,
rocblas_datatype_f64_c
,
N
,
rocblas_datatype_f64_c
);
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
complex
<
double
>
alpha
,
const
platform
::
complex
<
double
>
*
A
,
const
platform
::
complex
<
double
>
*
B
,
platform
::
complex
<
double
>
beta
,
platform
::
complex
<
double
>
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_operation
cuTransB
=
(
transB
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
platform
::
errors
::
InvalidArgument
(
"cublas complex128 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
thrust
::
complex
<
double
>
c_alpha
=
thrust
::
complex
<
double
>
(
alpha
.
real
,
alpha
.
imag
);
thrust
::
complex
<
double
>
c_beta
=
thrust
::
complex
<
double
>
(
beta
.
real
,
beta
.
imag
);
auto
&
cuda_ctx
=
const_cast
<
pten
::
GPUContext
&>
(
context_
);
CUBlas
<
platform
::
complex
<
double
>>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
c_alpha
,
B
,
rocblas_datatype_f64_c
,
ldb
,
A
,
rocblas_datatype_f64_c
,
lda
,
&
c_beta
,
C
,
rocblas_datatype_f64_c
,
N
,
rocblas_datatype_f64_c
);
}
template
<
>
template
<
typename
T
>
...
...
@@ -541,6 +719,22 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
&
beta
,
C
,
ldc
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation
cuTransA
=
transA
?
rocblas_operation_transpose
:
rocblas_operation_none
;
rocblas_operation
cuTransB
=
transB
?
rocblas_operation_transpose
:
rocblas_operation_none
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
});
}
template
<
>
template
<
>
...
...
@@ -560,6 +754,26 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
});
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
int
lda
,
const
platform
::
float16
*
B
,
int
ldb
,
platform
::
float16
beta
,
platform
::
float16
*
C
,
int
ldc
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation
cuTransA
=
transA
?
rocblas_operation_transpose
:
rocblas_operation_none
;
rocblas_operation
cuTransB
=
transB
?
rocblas_operation_transpose
:
rocblas_operation_none
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
platform
::
float16
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -569,6 +783,13 @@ void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
CUBlas
<
T
>::
AXPY
(
handle
,
n
,
&
alpha
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
AXPY
(
handle
,
n
,
&
alpha
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -576,6 +797,12 @@ void Blas<platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
context_
.
CublasCall
(
[
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
SCAL
(
handle
,
n
,
&
alpha
,
x
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
SCAL
(
int
n
,
const
T
alpha
,
T
*
x
)
const
{
context_
.
CublasCall
(
[
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
SCAL
(
handle
,
n
,
&
alpha
,
x
,
1
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -583,6 +810,12 @@ void Blas<platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
context_
.
CublasCall
(
[
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
VCOPY
(
handle
,
n
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
VCOPY
(
int
n
,
const
T
*
x
,
T
*
y
)
const
{
context_
.
CublasCall
(
[
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
VCOPY
(
handle
,
n
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -596,6 +829,17 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
CUBlas
<
T
>::
GEMV
(
handle
,
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
rocblas_operation
cuTransA
=
!
trans_a
?
rocblas_operation_transpose
:
rocblas_operation_none
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GEMV
(
handle
,
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
});
}
template
<
>
template
<
>
...
...
@@ -612,6 +856,23 @@ inline void Blas<platform::CUDADeviceContext>::GEMV(
alpha
,
A
,
B
,
beta
,
C
);
}
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
const
platform
::
float16
*
B
,
platform
::
float16
beta
,
platform
::
float16
*
C
)
const
{
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if
(
trans_a
)
{
this
->
template
GEMM
<
platform
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
1
,
N
,
M
,
alpha
,
B
,
A
,
beta
,
C
);
}
else
{
this
->
template
GEMM
<
platform
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
M
,
1
,
N
,
alpha
,
A
,
B
,
beta
,
C
);
}
}
template
<
>
template
<
typename
T
>
...
...
@@ -637,6 +898,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
ldc
,
strideC
,
batchCount
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
,
int
batchCount
,
int64_t
strideA
,
int64_t
strideB
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
int
ldc
=
N
;
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_operation
cuTransB
=
(
transB
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
const
int64_t
strideC
=
M
*
N
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GEMM_STRIDED_BATCH
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -648,6 +935,18 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
C
[
k
]);
}
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
**
A
,
const
T
**
B
,
T
beta
,
T
**
C
,
int
batchCount
)
const
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
this
->
template
GEMM
<
T
>(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
A
[
k
],
B
[
k
],
beta
,
C
[
k
]);
}
}
template
<
>
template
<
>
...
...
@@ -661,6 +960,18 @@ inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
B
[
k
],
beta
,
C
[
k
]);
}
}
template
<
>
template
<
>
inline
void
Blas
<
pten
::
GPUContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
**
A
,
const
platform
::
float16
**
B
,
platform
::
float16
beta
,
platform
::
float16
**
C
,
int
batchCount
)
const
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
this
->
template
GEMM
<
platform
::
float16
>(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
A
[
k
],
B
[
k
],
beta
,
C
[
k
]);
}
}
template
<
>
template
<
typename
T
>
...
...
@@ -687,6 +998,30 @@ void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
lda
,
B
,
ldb
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
TRSM
(
CBLAS_SIDE
side
,
CBLAS_UPLO
uplo
,
CBLAS_TRANSPOSE
transA
,
CBLAS_DIAG
diag
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
int
lda
,
T
*
B
,
int
ldb
)
const
{
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side
cuSide
=
(
side
==
CblasLeft
)
?
rocblas_side_right
:
rocblas_side_left
;
rocblas_fill
cuUplo
=
(
uplo
==
CblasLower
)
?
rocblas_fill_upper
:
rocblas_fill_lower
;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_diagonal
cuDiag
=
(
diag
==
CblasUnit
)
?
rocblas_diagonal_unit
:
rocblas_diagonal_non_unit
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
TRSM
(
handle
,
cuSide
,
cuUplo
,
cuTransA
,
cuDiag
,
N
,
M
,
&
alpha
,
A
,
lda
,
B
,
ldb
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -697,6 +1032,14 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRF(int n, T **a, int *ipiv,
CUBlas
<
T
>::
GETRF_BATCH
(
handle
,
n
,
a
,
n
,
ipiv
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedGETRF
(
int
n
,
T
**
a
,
int
*
ipiv
,
int
*
info
,
int
batch_size
)
const
{
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GETRF_BATCH
(
handle
,
n
,
a
,
n
,
ipiv
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -715,6 +1058,22 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRI(int n, const T **a,
CUBlas
<
T
>::
GETRI_BATCH
(
handle
,
n
,
a
,
n
,
ipiv
,
a_inv
,
n
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedGETRI
(
int
n
,
const
T
**
a
,
const
int
*
ipiv
,
T
**
a_inv
,
int
*
info
,
int
batch_size
)
const
{
PADDLE_ENFORCE_NE
(
a_inv
,
a
,
platform
::
errors
::
InvalidArgument
(
"cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
"in-place. The memory space of output matrix (address: %p) cannot "
"overlap memory space of input matrix (address: %p)."
,
a_inv
,
a
));
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GETRI_BATCH
(
handle
,
n
,
a
,
n
,
ipiv
,
a_inv
,
n
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -725,6 +1084,14 @@ void Blas<platform::CUDADeviceContext>::BatchedMatInv(int n, const T **a,
CUBlas
<
T
>::
MATINV_BATCH
(
handle
,
n
,
a
,
n
,
a_inv
,
n
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedMatInv
(
int
n
,
const
T
**
a
,
T
**
a_inv
,
int
*
info
,
int
batch_size
)
const
{
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
MATINV_BATCH
(
handle
,
n
,
a
,
n
,
a_inv
,
n
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -739,6 +1106,20 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
batch_size
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedGETRS
(
CBLAS_TRANSPOSE
trans
,
int
n
,
int
nrhs
,
const
T
**
a
,
int
lda
,
int
*
ipiv
,
T
**
b
,
int
ldb
,
int
*
info
,
int
batch_size
)
const
{
rocblas_operation
cuTrans
=
(
trans
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
GETRS_BATCH
(
handle
,
cuTrans
,
n
,
nrhs
,
a
,
lda
,
ipiv
,
b
,
ldb
,
info
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
...
...
@@ -764,6 +1145,31 @@ void Blas<platform::CUDADeviceContext>::BatchedTRSM(
&
alpha
,
A
,
lda
,
B
,
ldb
,
batch_size
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
pten
::
GPUContext
>::
BatchedTRSM
(
CBLAS_SIDE
side
,
CBLAS_UPLO
uplo
,
CBLAS_TRANSPOSE
transA
,
CBLAS_DIAG
diag
,
int
M
,
int
N
,
T
alpha
,
const
T
**
A
,
int
lda
,
T
**
B
,
int
ldb
,
int
batch_size
)
const
{
// solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
// where ' stands for transpose
rocblas_side
cuSide
=
(
side
==
CblasLeft
)
?
rocblas_side_right
:
rocblas_side_left
;
rocblas_fill
cuUplo
=
(
uplo
==
CblasLower
)
?
rocblas_fill_upper
:
rocblas_fill_lower
;
// use CUBLAS_OP_C (conjugate transpose) for complex
rocblas_operation
cuTransA
=
(
transA
==
CblasNoTrans
)
?
rocblas_operation_none
:
rocblas_operation_transpose
;
rocblas_diagonal
cuDiag
=
(
diag
==
CblasUnit
)
?
rocblas_diagonal_unit
:
rocblas_diagonal_non_unit
;
context_
.
CublasCall
([
&
](
rocblas_handle
handle
)
{
CUBlas
<
T
>::
TRSM_BATCH
(
handle
,
cuSide
,
cuUplo
,
cuTransA
,
cuDiag
,
N
,
M
,
&
alpha
,
A
,
lda
,
B
,
ldb
,
batch_size
);
});
}
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/math/concat_test.cc
...
...
@@ -16,6 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
/**
* case 1:
...
...
@@ -441,6 +443,31 @@ void TestConcatMain() {
delete
context
;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template
<
>
void
TestConcatMain
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
()
{
auto
*
context
=
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
CUDAPlace
());
context
->
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
paddle
::
platform
::
CUDAPlace
(),
context
->
stream
())
.
get
());
context
->
PartialInitWithAllocator
();
ConcatCase1
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
(
context
);
ConcatCase2
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
(
context
);
ConcatCase3
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
(
context
);
ConcatCase4
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
(
context
);
delete
context
;
}
#endif
TEST
(
math
,
concat
)
{
TestConcatMain
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
...
...
paddle/fluid/operators/math/cusparse_conversion_api_test.cc
...
...
@@ -24,6 +24,11 @@ void TestNNZ(const std::vector<T>& dense_data, const int correct_nnz,
const
int
rows
,
const
int
cols
)
{
paddle
::
platform
::
CUDADeviceContext
*
context
=
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
CUDAPlace
());
context
->
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
paddle
::
platform
::
CUDAPlace
(),
context
->
stream
())
.
get
());
context
->
PartialInitWithAllocator
();
auto
sparse
=
paddle
::
operators
::
math
::
GetSparse
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
*
context
);
...
...
@@ -61,6 +66,11 @@ void TestDenseToSparse(const std::vector<T>& correct_dense_data,
const
std
::
string
&
mode
)
{
paddle
::
platform
::
CUDADeviceContext
*
context
=
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
CUDAPlace
());
context
->
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
paddle
::
platform
::
CUDAPlace
(),
context
->
stream
())
.
get
());
context
->
PartialInitWithAllocator
();
// get sparse
auto
sparse
=
paddle
::
operators
::
math
::
GetSparse
<
paddle
::
platform
::
CUDADeviceContext
,
...
...
paddle/fluid/operators/math/im2col_test.cc
...
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template
<
typename
DeviceContext
,
typename
Place
>
void
testIm2col
()
{
...
...
@@ -60,6 +62,7 @@ void testIm2col() {
auto
*
place
=
new
Place
();
DeviceContext
*
context
=
new
DeviceContext
(
*
place
);
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
...
...
@@ -164,6 +167,165 @@ void testIm2col() {
delete
context
;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template
<
>
void
testIm2col
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
()
{
paddle
::
framework
::
Tensor
input_tmp
;
paddle
::
framework
::
Tensor
input
;
paddle
::
framework
::
Tensor
output_cfo
;
paddle
::
framework
::
Tensor
output_ocf
;
paddle
::
framework
::
Tensor
output_tmp
;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int
input_height
=
2
;
int
input_width
=
3
;
int
filter_size
=
2
;
std
::
vector
<
int
>
stride
({
1
,
1
});
// stride_y, stride_x
std
::
vector
<
int
>
padding
(
{
0
,
0
,
0
,
0
});
// up_pad, left_pad, down_pad, right_pad
std
::
vector
<
int
>
dilation
({
1
,
1
});
// dilation_y, dilation_x
int
output_height
=
(
input_height
-
filter_size
+
padding
[
0
]
+
padding
[
1
])
/
stride
[
0
]
+
1
;
int
output_width
=
(
input_width
-
filter_size
+
padding
[
2
]
+
padding
[
3
])
/
stride
[
1
]
+
1
;
float
*
input_ptr
=
input_tmp
.
mutable_data
<
float
>
(
{
1
,
input_height
,
input_width
},
paddle
::
platform
::
CPUPlace
());
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
memcpy
(
input_ptr
,
arr
,
6
*
sizeof
(
float
));
auto
*
place
=
new
paddle
::
platform
::
CUDAPlace
();
auto
*
context
=
new
paddle
::
platform
::
CUDADeviceContext
(
*
place
);
context
->
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
*
place
,
context
->
stream
())
.
get
());
context
->
PartialInitWithAllocator
();
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
paddle
::
framework
::
TensorCopySync
(
input_tmp
,
*
place
,
&
input
);
}
output_cfo
.
mutable_data
<
float
>
(
{
1
,
filter_size
,
filter_size
,
output_height
,
output_width
},
*
place
);
output_ocf
.
mutable_data
<
float
>
(
{
output_height
,
output_width
,
1
,
filter_size
,
filter_size
},
*
place
);
// Im2Col
paddle
::
operators
::
math
::
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
paddle
::
platform
::
CUDADeviceContext
,
float
>
im2col
;
paddle
::
operators
::
math
::
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
paddle
::
platform
::
CUDADeviceContext
,
float
>
im2col_ocf
;
im2col
(
*
context
,
input
,
dilation
,
stride
,
padding
,
&
output_cfo
);
im2col_ocf
(
*
context
,
input
,
dilation
,
stride
,
padding
,
&
output_ocf
);
float
out_cfo_data
[]
=
{
0
,
1
,
1
,
2
,
3
,
4
,
4
,
5
};
float
out_ocf_data
[]
=
{
0
,
1
,
3
,
4
,
1
,
2
,
4
,
5
};
float
*
out_cfo_ptr
;
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
out_cfo_ptr
=
output_cfo
.
data
<
float
>
();
}
else
{
paddle
::
framework
::
TensorCopySync
(
output_cfo
,
paddle
::
platform
::
CPUPlace
(),
&
output_tmp
);
out_cfo_ptr
=
output_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
EXPECT_EQ
(
out_cfo_ptr
[
i
],
out_cfo_data
[
i
]);
}
float
*
out_ocf_ptr
;
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
out_ocf_ptr
=
output_ocf
.
data
<
float
>
();
}
else
{
paddle
::
framework
::
TensorCopySync
(
output_ocf
,
paddle
::
platform
::
CPUPlace
(),
&
output_tmp
);
out_ocf_ptr
=
output_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
EXPECT_EQ
(
out_ocf_ptr
[
i
],
out_ocf_data
[
i
]);
}
// Col2Im: kCFO
paddle
::
operators
::
math
::
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
paddle
::
platform
::
CUDADeviceContext
,
float
>
col2im
;
paddle
::
operators
::
math
::
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
paddle
::
platform
::
CUDADeviceContext
,
float
>
col2im_ocf
;
float
col2im_data
[]
=
{
0
,
2
,
2
,
3
,
8
,
5
};
memset
(
input_ptr
,
0
,
6
*
sizeof
(
float
));
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
paddle
::
framework
::
TensorCopySync
(
input_tmp
,
*
place
,
&
input
);
}
col2im
(
*
context
,
output_cfo
,
dilation
,
stride
,
padding
,
&
input
);
float
*
in_ptr
;
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
in_ptr
=
input
.
data
<
float
>
();
}
else
{
paddle
::
framework
::
TensorCopySync
(
input
,
paddle
::
platform
::
CPUPlace
(),
&
input_tmp
);
in_ptr
=
input_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
EXPECT_EQ
(
in_ptr
[
i
],
col2im_data
[
i
]);
}
// Col2Im: kOCF
memset
(
input_ptr
,
0
,
6
*
sizeof
(
float
));
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
input
=
input_tmp
;
}
else
{
paddle
::
framework
::
TensorCopySync
(
input_tmp
,
*
place
,
&
input
);
}
col2im_ocf
(
*
context
,
output_ocf
,
dilation
,
stride
,
padding
,
&
input
);
if
(
paddle
::
platform
::
is_cpu_place
(
*
place
))
{
in_ptr
=
input
.
data
<
float
>
();
}
else
{
paddle
::
framework
::
TensorCopySync
(
input
,
paddle
::
platform
::
CPUPlace
(),
&
input_tmp
);
in_ptr
=
input_tmp
.
data
<
float
>
();
}
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
EXPECT_EQ
(
in_ptr
[
i
],
col2im_data
[
i
]);
}
delete
place
;
delete
context
;
}
#endif
TEST
(
math
,
im2col
)
{
testIm2col
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
...
paddle/fluid/operators/math/inclusive_scan.h
...
...
@@ -194,7 +194,7 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
  constexpr size_t kThreadNumY = 32;
  size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize().x);
  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
  dim3 thread_dims(kThreadNumX, kThreadNumY);
  if (reverse) {
    InclusiveScanInnerDimCUDAKernel<
...
paddle/fluid/operators/math/math_function.cu
...
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/eigen/common.h"
namespace paddle {
...
...
@@ -44,6 +45,18 @@ template struct SetConstant<platform::CUDADeviceContext,
template struct SetConstant<platform::CUDADeviceContext,
                            platform::complex<double>>;

template struct SetConstant<pten::GPUContext, platform::float16>;
template struct SetConstant<pten::GPUContext, platform::bfloat16>;
template struct SetConstant<pten::GPUContext, float>;
template struct SetConstant<pten::GPUContext, double>;
template struct SetConstant<pten::GPUContext, uint8_t>;
template struct SetConstant<pten::GPUContext, int>;
template struct SetConstant<pten::GPUContext, int16_t>;
template struct SetConstant<pten::GPUContext, int64_t>;
template struct SetConstant<pten::GPUContext, bool>;
template struct SetConstant<pten::GPUContext, platform::complex<float>>;
template struct SetConstant<pten::GPUContext, platform::complex<double>>;

template struct SetConstant<platform::CUDAPinnedDeviceContext,
                            platform::float16>;
template struct SetConstant<platform::CUDAPinnedDeviceContext,
...
paddle/fluid/operators/math/math_function_test.cc
...
...
@@ -223,6 +223,7 @@ TEST(math_funciton, set_constant) {
  t.Resize({10, 10});
  t.mutable_data<int>(paddle::platform::CPUPlace());
  auto* ctx = new paddle::platform::CPUDeviceContext();
  ctx->Init();
  paddle::operators::math::set_constant(*ctx, &t, 10);
  for (int64_t i = 0; i < t.numel(); ++i) {
    PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
...
paddle/fluid/operators/math/math_function_test.cu
...
...
@@ -46,6 +46,10 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
float
*
input1_ptr
=
input1
.
mutable_data
<
float
>
({
2
,
3
},
cpu_place
);
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
...
...
@@ -78,6 +82,10 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
...
...
@@ -117,6 +125,10 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
float
*
input1_ptr
=
input1
.
mutable_data
<
float
>
({
2
,
3
},
cpu_place
);
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
...
...
@@ -155,6 +167,10 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
...
...
@@ -200,6 +216,10 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
int
m
=
2
;
int
n
=
3
;
...
...
@@ -254,6 +274,10 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
...
...
@@ -316,6 +340,10 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
int
m
=
2
;
int
n
=
3
;
...
...
@@ -364,6 +392,10 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
...
...
@@ -418,6 +450,10 @@ void GemvTest(int m, int n, bool trans) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
T
*
data_a
=
mat_a
.
mutable_data
<
T
>
({
m
,
n
},
cpu_place
);
T
*
data_b
=
vec_b
.
mutable_data
<
T
>
({
trans
?
m
:
n
},
cpu_place
);
...
...
paddle/fluid/operators/math/vol2col_test.cc
...
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include <gtest/gtest.h>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
template
<
typename
DeviceContext
,
typename
Place
>
void
testVol2col
()
{
...
...
@@ -25,7 +27,6 @@ void testVol2col() {
auto
*
place
=
new
Place
();
DeviceContext
*
context
=
new
DeviceContext
(
*
place
);
/**
* input = [[0, 1, 2,
* 3, 4, 5]
...
...
@@ -123,6 +124,124 @@ void testVol2col() {
delete
context
;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template
<
>
void
testVol2col
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
()
{
paddle
::
framework
::
Tensor
input
;
paddle
::
framework
::
Tensor
input_tmp
;
paddle
::
framework
::
Tensor
output
;
paddle
::
framework
::
Tensor
output_tmp
;
auto
*
place
=
new
paddle
::
platform
::
CUDAPlace
();
auto
*
context
=
new
paddle
::
platform
::
CUDADeviceContext
(
*
place
);
context
->
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
*
place
,
context
->
stream
())
.
get
());
context
->
PartialInitWithAllocator
();
/**
* input = [[0, 1, 2,
* 3, 4, 5]
* [6, 7, 8,
* 9, 10, 11]]
*
* output = [0, 1
* 1, 2
* 3, 4
* 4, 5
* 6, 7
* 7, 8
* 9, 10
* 10, 11]
*
* col2vol = [[0, 2, 2,
* 3, 8, 5]
* [6, 14, 8,
* 9, 20, 11]]
*
*/
  int input_depth = 2;
  int input_height = 2;
  int input_width = 3;
  int filter_size = 2;
  std::vector<int> strides({1, 1, 1});
  std::vector<int> paddings({0, 0, 0});
  std::vector<int> dilations({1, 1, 1});

  int output_depth =
      (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
  int output_height =
      (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
  int output_width =
      (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;

  // Vol2Col test
  float* input_ptr =
      input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
                                    paddle::platform::CPUPlace());
  float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  memcpy(input_ptr, arr, 12 * sizeof(float));

  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
    paddle::framework::TensorCopySync(input_tmp, *place, &input);
  }
  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                              output_depth, output_height, output_width},
                             *place);

  paddle::operators::math::Vol2ColFunctor<paddle::platform::CUDADeviceContext,
                                          float>
      vol2col;
  vol2col(*context, input, dilations, strides, paddings, &output);

  float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
  float* out_cfo_ptr;
  if (paddle::platform::is_cpu_place(*place)) {
    out_cfo_ptr = output.data<float>();
  } else {
    paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(),
                                      &output_tmp);
    out_cfo_ptr = output_tmp.data<float>();
  }

  for (int i = 0; i < 16; ++i) {
    EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
  }

  // Col2Vol test
  float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
  memset(input_ptr, 0, 12 * sizeof(float));
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
    paddle::framework::TensorCopySync(input_tmp, *place, &input);
  }

  paddle::operators::math::Col2VolFunctor<paddle::platform::CUDADeviceContext,
                                          float>
      col2vol;
  col2vol(*context, output, dilations, strides, paddings, &input);

  float* in_ptr;
  if (paddle::platform::is_cpu_place(*place)) {
    in_ptr = input.data<float>();
  } else {
    paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(),
                                      &input_tmp);
    in_ptr = input_tmp.data<float>();
  }

  for (int i = 0; i < 12; ++i) {
    EXPECT_EQ(in_ptr[i], col_2_vol[i]);
  }

  delete place;
  delete context;
}
#endif
TEST(math, vol2col) {
  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
...
paddle/fluid/operators/mean_op.cu
...
...
@@ -66,7 +66,8 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
      reduce_dims.push_back(i);
    }
    TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
        *input, output, Div(numel), reduce_dims, stream);
        context.cuda_device_context(), *input, output, Div(numel),
        reduce_dims, stream);
  }
};
...
...
paddle/fluid/operators/nccl/nccl_op_test.cu.cc
...
...
@@ -57,7 +57,12 @@ class NCCLTester : public ::testing::Test {
    paddle::platform::CPUPlace cpu_place;
    for (size_t i = 0; i < gpu_list_.size(); ++i) {
      p::CUDAPlace place(i);
      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
      auto* ctx = new p::CUDADeviceContext(place);
      ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(place, ctx->stream())
                            .get());
      ctx->PartialInitWithAllocator();
      dev_ctxs_.emplace_back(ctx);
    }

    NCCLInitOp();
...
...
paddle/fluid/operators/p_norm_op.cu
...
...
@@ -106,16 +106,20 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
    using MT = typename details::MPTypeTrait<T>::Type;
    if (porder == 0) {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
          *in_x, out_norm, NonzeroFunctor<T>(), reduce_axis, stream);
          ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
          reduce_axis, stream);
    } else if (porder == INFINITY) {
      TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
          *in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
    } else if (porder == -INFINITY) {
      TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
          *in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
    } else {
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
          *in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
          ctx.cuda_device_context(), *in_x, out_norm,
          UnsignedPowFunctor<T>(porder), reduce_axis, stream);

      const framework::Tensor* tmp_norm = out_norm;
      std::vector<const framework::Tensor*> ins = {tmp_norm};
...
...
paddle/fluid/operators/pool_op.h
...
...
@@ -208,8 +208,8 @@ class PoolKernel : public framework::OpKernel<T> {
      auto stream = dev_ctx.stream();
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
          *in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim, stream);
          dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num), reduce_dim,
          stream);
#else  // for cpu
      paddle::operators::math::Pool2dFunctor<
          DeviceContext, paddle::operators::math::AvgPool<T>, T>
...
...
paddle/fluid/operators/prelu_op.cu
...
...
@@ -186,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
    }
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dalpha_tmp, dalpha, kps::IdentityFunctor<T>(), reduce_dims, stream);
        context.cuda_device_context(), dalpha_tmp, dalpha,
        kps::IdentityFunctor<T>(), reduce_dims, stream);
  }
};
...
...
paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
...
...
@@ -222,6 +222,10 @@ TEST(SENDANDRECV, GPU) {
  framework::Scope* scope = (*micro_scope)[0];
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  framework::Executor exe(place);
  // create var on local scope
...
...
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
...
...
@@ -39,14 +39,16 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
          typename TransformOp>
void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
                             const framework::Tensor& x, framework::Tensor* y,
                             const TransformOp& transform,
                             const std::vector<int>& origin_reduce_dims,
                             gpuStream_t stream) {
  y->mutable_data<Ty>(x.place());

  pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
      x, y, transform, origin_reduce_dims, stream);
      static_cast<const pten::GPUContext&>(dev_ctx), x, y, transform,
      origin_reduce_dims, stream);
}

}  // namespace operators
...
...
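The hunk above changes the call convention of the fluid-side reduction wrapper: the CUDA device context is now the first argument and is forwarded to the pten kernel as a `pten::GPUContext`. A short, hedged sketch of a call site under the new signature; the helper `SumAllToScalar` and its tensors are illustrative only, and the functor names are assumed to come from `paddle::operators::kernel_primitives` as in the hunks above:

```cpp
#include <vector>
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"

// Illustrative caller: reduce every dimension of x into a scalar tensor.
template <typename T>
void SumAllToScalar(const paddle::platform::CUDADeviceContext& dev_ctx,
                    const paddle::framework::Tensor& x,
                    paddle::framework::Tensor* out) {
  std::vector<int> reduce_dims;
  for (int i = 0; i < x.dims().size(); ++i) reduce_dims.push_back(i);
  // dev_ctx is now the first argument; the stream is still passed explicitly.
  paddle::operators::TensorReduceFunctorImpl<
      T, T, paddle::operators::kernel_primitives::AddFunctor,
      paddle::operators::kernel_primitives::IdentityFunctor<T>>(
      dev_ctx, x, out, paddle::operators::kernel_primitives::IdentityFunctor<T>(),
      reduce_dims, dev_ctx.stream());
}
```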
paddle/fluid/operators/renorm_op.cu
...
...
@@ -156,7 +156,8 @@ class CUDARenormKernel : public framework::OpKernel<T> {
                                                             cuda_ctx, ins,
                                                             &outs, func);
    std::vector<int> reduce_axis = {0, 2};
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
        cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(),
        reduce_axis, stream);
    RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
        numel, dim_value.mutable_data<T>(context.GetPlace()), p, max_norm);
    RenormKernelFunc4<T><<<grid, block, 0, stream>>>(
...
...
@@ -213,10 +214,11 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
                                                          dim_divisor);
    std::vector<int> reduce_axis = {0, 2};
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis, stream);
        ctx.cuda_device_context(), pow_value, &dim_value,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        mul_value, &weight_derivative, kps::IdentityFunctor<T>(), reduce_axis,
        stream);
        ctx.cuda_device_context(), mul_value, &weight_derivative,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
    RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
        x_data, dout_data, dx_data, numel,
        dim_value.mutable_data<T>(ctx.GetPlace()),
...
...
paddle/fluid/operators/reshape_op.cc
...
...
@@ -389,7 +389,8 @@ class ReshapeKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
      pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
      pten::ReshapeKernel(static_cast<const pten::GPUContext&>(dev_ctx), *in,
                          pt_scalar_shape, out);
    }
#endif
#ifdef PADDLE_WITH_XPU
...
...
@@ -417,7 +418,8 @@ class ReshapeGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
      pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
      pten::ReshapeGradKernel(static_cast<const pten::GPUContext&>(dev_ctx),
                              *d_out, d_x);
    }
#endif
#ifdef PADDLE_WITH_XPU
...
...
@@ -445,7 +447,8 @@ class ReshapeDoubleGradKernel {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
      pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
      pten::ReshapeDoubleGradKernel(
          static_cast<const pten::GPUContext&>(dev_ctx), *dd_x, dd_out);
    }
#endif
#ifdef PADDLE_WITH_XPU
...
...
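The reshape hunks above show the general bridging idiom this PR enables: since `CUDADeviceContext` now derives from `pten::GPUContext`, a fluid kernel can hand its context to pten code with a plain upcast. A hedged sketch of that idiom; `RunPtenKernel` is a hypothetical pten-side entry point, everything else follows the diff:

```cpp
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

// Hypothetical pten-side function taking the new context type.
void RunPtenKernel(const pten::GPUContext& gpu_ctx);

void RunOnPten(const paddle::framework::ExecutionContext& ctx) {
  auto& dev_ctx = ctx.device_context<paddle::platform::CUDADeviceContext>();
  // Upcast, no copy: valid because CUDADeviceContext is-a pten::GPUContext.
  const pten::GPUContext& gpu_ctx =
      static_cast<const pten::GPUContext&>(dev_ctx);
  RunPtenKernel(gpu_ctx);
}
```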
paddle/fluid/operators/scatter.cu.h
...
...
@@ -183,8 +183,7 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
  int64_t max_grid_dimx =
      reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
          .GetCUDAMaxGridDimSize()
          .x;
          .GetCUDAMaxGridDimSize()[0];
  int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;

  ScatterInitCUDAKernel<T, IndexT><<<
...
...
paddle/fluid/operators/solve_op.h
...
...
@@ -46,7 +46,8 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
#if defined(__NVCC__) || defined(__HIPCC__)
  auto stream = ctx.cuda_device_context().stream();
  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      *input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
      ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
      reduce_dims, stream);
#else
  ReduceKernelFunctor<DeviceContext, T, ops::SumFunctor>(
      input, output, reduce_dims, keep_dim, false, ctx)
...
paddle/fluid/operators/strided_memcpy_test.cc
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
namespace paddle {
namespace operators {
...
...
@@ -86,6 +87,10 @@ TEST(StridedMemcpy, GPUCrop) {
  platform::CPUPlace cpu;
  platform::CUDADeviceContext ctx(gpu0);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(gpu0, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();

  auto src_allocation = memory::Alloc(gpu0, sizeof(src));
...
...
@@ -124,6 +129,10 @@ TEST(StridedMemcpy, GPUConcat) {
  platform::CUDAPlace gpu0(0);
  platform::CPUPlace cpu;
  platform::CUDADeviceContext ctx(gpu0);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(gpu0, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();

  auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
  int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
...
...
@@ -37,6 +37,10 @@ void CreateCUDATensor(framework::Scope* scope, const std::string& name,
  tensor->Resize(dims);
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
}
...
...
@@ -133,6 +137,10 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
  framework::Scope scope;
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  // Prepare variables.
  if (allow_build_at_runtime)
    CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
...
...
@@ -159,6 +167,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  framework::Scope scope;
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
  ctx.PartialInitWithAllocator();
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
...
...
paddle/fluid/operators/top_k_function_cuda.h
...
...
@@ -411,7 +411,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
  };
  int block_size = ComputeBlockSize(num_cols);
  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
  // actually, int num_rows < max_grid_size
  unsigned int grid_size = num_rows < maxGridDimX
                               ? static_cast<unsigned int>(num_rows)
...
...
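Several of the hunks above (scatter, top-k, viterbi decode) adapt to the same API change: `GetCUDAMaxGridDimSize()` now returns `std::array<int, 3>` instead of `dim3`, so call sites index `[0]` rather than `.x`. A minimal sketch, not taken from the patch, with `dev_ctx` and `height` assumed to be valid inputs:

```cpp
#include <algorithm>
#include <array>
#include <cstdint>
#include "paddle/fluid/platform/device_context.h"

// Compute a launch grid size capped by the device's max grid x-dimension.
int64_t GridSizeFor(const paddle::platform::CUDADeviceContext& dev_ctx,
                    int64_t height) {
  // Previously: dev_ctx.GetCUDAMaxGridDimSize().x (a dim3 member).
  std::array<int, 3> max_grid = dev_ctx.GetCUDAMaxGridDimSize();
  return std::min<int64_t>(height, max_grid[0]);
}
```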
paddle/fluid/operators/trace_op.cu
...
...
@@ -40,7 +40,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
      std::vector<int> reduce_dims;
      reduce_dims.push_back(out->dims().size());
      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
          context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
          reduce_dims, stream);
    } else {
      math::SetConstant<DeviceContext, T> functor;
      functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
...
...
paddle/fluid/operators/triangular_solve_op.cu
...
...
@@ -45,7 +45,8 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
    }
    gpuStream_t stream = ctx.cuda_device_context().stream();
    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        in, out, kps::IdentityFunctor<T>(), out_reduce_dims, stream);
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
  }
};
...
...
paddle/fluid/operators/viterbi_decode_op.cu
...
...
@@ -148,7 +148,7 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
    }
    const auto& dev_ctx = ctx.cuda_device_context();
    auto cu_stream = dev_ctx.stream();
    int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x;
    int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
    int64_t height = pre * post;
    int64_t width = n;
    int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
...
...
paddle/fluid/platform/CMakeLists.txt
...
...
@@ -138,6 +138,7 @@ if(WITH_CNCL)
endif()

if(WITH_GPU OR WITH_ROCM)
  target_link_libraries(device_context gpu_info gpu_context pten_gpu_info)
  target_link_libraries(device_context gpu_resource_pool)
endif()
...
...
paddle/fluid/platform/bfloat16_test.cu
...
...
@@ -66,6 +66,10 @@ TEST(bfloat16, lod_tensor_on_gpu) {
  // CPU LoDTensor to GPU LoDTensor
  CUDAPlace gpu_place(0);
  CUDADeviceContext gpu_ctx(gpu_place);
  gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, gpu_ctx.stream())
                           .get());
  gpu_ctx.PartialInitWithAllocator();
  framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
...
...
paddle/fluid/platform/collective_helper.cc
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
...
...
@@ -187,6 +188,18 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
                                          int dev_id, int ring_id) {
  std::unique_ptr<CUDADeviceContext> dev_ctx(
      new CUDADeviceContext(CUDAPlace(dev_id)));
  dev_ctx->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream())
          .get());
  dev_ctx->SetHostAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CPUPlace())
          .get());
  dev_ctx->SetZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(CUDAPlace(dev_id))
          .get());
  dev_ctx->PartialInitWithAllocator();

  std::shared_ptr<platform::CudaEventObject> compute_event(
      platform::CudaEventResourcePool::Instance().New(dev_id));
...
...
@@ -329,7 +342,7 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(BKCLContext_t comm, int nranks,
    auto* dev_ctx = static_cast<platform::XPUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(platform::XPUPlace(dev_id)));
    dev_ctx->set_bkcl_context(comm);
    dev_ctx->SetBkclContext(comm);
  }
  return comm_map_[ring_id][dev_id].get();
...
...
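For contexts created outside the `DeviceContextPool` (as `AssignNCCLComm` does above), the full wiring now covers device, host, and zero-size allocators before `PartialInitWithAllocator()`. A hedged sketch of that pattern; the helper name `MakeCommContext` is hypothetical, the call sequence mirrors the hunk above:

```cpp
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Hypothetical factory showing the full allocator wiring for a self-owned
// CUDADeviceContext.
std::unique_ptr<paddle::platform::CUDADeviceContext> MakeCommContext(
    int dev_id) {
  using paddle::memory::allocation::AllocatorFacade;
  paddle::platform::CUDAPlace place(dev_id);
  auto ctx = std::make_unique<paddle::platform::CUDADeviceContext>(place);
  ctx->SetAllocator(
      AllocatorFacade::Instance().GetAllocator(place, ctx->stream()).get());
  ctx->SetHostAllocator(AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
  ctx->SetZeroAllocator(
      AllocatorFacade::Instance().GetZeroAllocator(place).get());
  ctx->PartialInitWithAllocator();
  return ctx;
}
```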
paddle/fluid/platform/device/gpu/CMakeLists.txt
IF(WITH_GPU)
  add_subdirectory(cuda)
  nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda)
  nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)

  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
  nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
ELSEIF(WITH_ROCM)
  add_subdirectory(rocm)
  hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda)
  hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda)

  hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
  hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
...
...
paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
...
...
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
...
...
@@ -14,8 +14,10 @@
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
...
...
@@ -96,8 +98,7 @@ class CublasHandleHolder {
    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
  }

  template <typename Callback>
  inline void Call(Callback&& callback) const {
  inline void Call(const std::function<void(blasHandle_t)>& callback) const {
    std::lock_guard<std::mutex> guard(mtx_);
    callback(handle_);
  }
...
...
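`CublasHandleHolder::Call` (and the cusparse variant in the next file) now takes a `std::function` instead of a forwarding template, so the same callable type can be shared with `pten::GPUContext`. Call sites that already passed a lambda should be unchanged. A hedged usage sketch for a CUDA build (where `blasHandle_t` is `cublasHandle_t`); `holder`, the matrix dimensions, and the device pointers are assumed to exist:

```cpp
// Sketch only: run one cuBLAS call under the holder's mutex.
holder.Call([&](cublasHandle_t handle) {
  // Any cuBLAS routine taking the raw handle may be issued here; cublasSgemm
  // is used purely as an example of a dynload-wrapped call.
  paddle::platform::dynload::cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m,
                                         n, k, &alpha, d_a, m, d_b, k, &beta,
                                         d_c, m);
});
```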
paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h
...
...
@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <functional>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
namespace paddle {
namespace platform {
...
...
@@ -45,8 +47,8 @@ class CusparseHandleHolder {
#endif
  }

  template <typename Callback>
  inline void Call(Callback&& callback) const {
  inline void Call(
      const std::function<void(pten::sparseHandle_t)>& callback) const {
    std::lock_guard<std::mutex> guard(mtx_);
    callback(handle_);
  }
...
...
paddle/fluid/platform/device/gpu/gpu_info.cc
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
...
...
@@ -39,11 +40,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);

constexpr static float fraction_reserve_gpu_memory = 0.05f;
...
...
@@ -51,23 +53,6 @@ constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
  // use user specified GPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (!FLAGS_selected_gpus.empty()) {
    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
    for (auto id : devices_str) {
      devices.push_back(atoi(id.c_str()));
    }
  } else {
    int count = GetGPUDeviceCount();
    for (int i = 0; i < count; ++i) {
      devices.push_back(i);
    }
  }
  return devices;
}

void GpuMemoryUsage(size_t* available, size_t* total) {
  size_t actual_available, actual_total;
...
...
@@ -382,5 +367,91 @@ void *GetGpuBasePtr(void *ptr, int dev_id) {
  return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr);
}

int DnnVersion() { return pten::backends::gpu::DnnVersion(); }

int GetGPUDeviceCount() { return pten::backends::gpu::GetGPUDeviceCount(); }

int GetGPUComputeCapability(int id) {
  return pten::backends::gpu::GetGPUComputeCapability(id);
}

int GetGPURuntimeVersion(int id) {
  return pten::backends::gpu::GetGPURuntimeVersion(id);
}

int GetGPUDriverVersion(int id) {
  return pten::backends::gpu::GetGPUDriverVersion(id);
}

bool TensorCoreAvailable() {
  return pten::backends::gpu::TensorCoreAvailable();
}

int GetGPUMultiProcessors(int id) {
  return pten::backends::gpu::GetGPUMultiProcessors(id);
}

int GetGPUMaxThreadsPerMultiProcessor(int id) {
  return pten::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(id);
}

int GetGPUMaxThreadsPerBlock(int id) {
  return pten::backends::gpu::GetGPUMaxThreadsPerBlock(id);
}

int GetCurrentDeviceId() { return pten::backends::gpu::GetCurrentDeviceId(); }

std::array<int, 3> GetGpuMaxGridDimSize(int id) {
  return pten::backends::gpu::GetGpuMaxGridDimSize(id);
}

std::vector<int> GetSelectedDevices() {
  return pten::backends::gpu::GetSelectedDevices();
}

const gpuDeviceProp& GetDeviceProperties(int id) {
  return pten::backends::gpu::GetDeviceProperties(id);
}

void SetDeviceId(int device_id) {
  pten::backends::gpu::SetDeviceId(device_id);
}

gpuError_t GpuGetLastError() {
  return pten::backends::gpu::GpuGetLastError();
}

void GpuStreamSync(gpuStream_t stream) {
  pten::backends::gpu::GpuStreamSync(stream);
}

void GpuDestroyStream(gpuStream_t stream) {
  pten::backends::gpu::GpuDestroyStream(stream);
}

void GpuDeviceSync() { pten::backends::gpu::GpuDeviceSync(); }

void GpuMemcpyAsync(void* dst, const void* src, size_t count,
                    gpuMemcpyKind kind, gpuStream_t stream) {
  pten::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}

void GpuMemcpySync(void* dst, const void* src, size_t count,
                   gpuMemcpyKind kind) {
  pten::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}

void GpuMemcpyPeerAsync(void* dst, int dst_device, const void* src,
                        int src_device, size_t count, gpuStream_t stream) {
  pten::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
                                          count, stream);
}

void GpuMemcpyPeerSync(void* dst, int dst_device, const void* src,
                       int src_device, size_t count) {
  pten::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
                                         count);
}

void GpuMemsetAsync(void* dst, int value, size_t count, gpuStream_t stream) {
  pten::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/device/gpu/gpu_info.h
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
...
...
@@ -52,7 +53,7 @@ int GetGPUMaxThreadsPerBlock(int id);
int GetCurrentDeviceId();

//! Get the maximum GridDim size for GPU buddy allocator.
dim3 GetGpuMaxGridDimSize(int);
std::array<int, 3> GetGpuMaxGridDimSize(int);

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
...
...
@@ -110,7 +111,7 @@ void GpuStreamSync(gpuStream_t stream);
void GpuDestroyStream(gpuStream_t stream);

// ! Blocks until device has completed all operations.
void GpuDeviceync();
void GpuDeviceSync();

//! CudaMalloc with recorded info
gpuError_t RecordedGpuMalloc(void** ptr, size_t size, int dev_id);
...
...
paddle/fluid/platform/device/gpu/nccl_helper.h
...
...
@@ -83,8 +83,21 @@ struct NCCLContext {
  std::unique_ptr<CUDADeviceContext> ctx_;
  ncclComm_t comm_;

  explicit NCCLContext(int dev_id)
      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
  explicit NCCLContext(int dev_id) : comm_{nullptr} {
    ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
    ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(CUDAPlace(dev_id), ctx_->stream())
                           .get());
    ctx_->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    ctx_->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(CUDAPlace(dev_id))
            .get());
    ctx_->PartialInitWithAllocator();
  }

  gpuStream_t stream() const { return ctx_->stream(); }
  ncclComm_t comm() const { return comm_; }
...
...
paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt
hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
paddle/fluid/platform/device_context.cc
...
...
@@ -10,8 +10,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
...
...
@@ -149,14 +153,15 @@ inline void EmplaceDeviceContext(
          cuda_ctx,
          platform::errors::InvalidArgument(
              "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
      dev_ctx->SetDeviceAllocator(
          memory::allocation::AllocatorFacade::Instance()
              .GetAllocator(p, cuda_ctx->context()->RawStream())
      // Note: A trick method to init context, why GetAllocator interface
      // needs a stream parameter?
      dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(p, cuda_ctx->stream())
                                .get());
      cuda_ctx->PartialInitWithAllocator();
#endif
    } else {
      dev_ctx->SetDeviceAllocator(
          memory::allocation::AllocatorFacade::Instance()
      dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(p)
                                .get());
    }
...
...
@@ -251,14 +256,18 @@ DeviceContextPool::DeviceContextPool(
  }
}

CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
  pten::CPUContext::Init();
}

CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
  pten::CPUContext::Init();
}

#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}

Place IPUDeviceContext::GetPlace() const { return place_; }
const Place& IPUDeviceContext::GetPlace() const { return place_; }

void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations completion in the stream. */
...
...
@@ -268,11 +277,14 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {}
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
  pten::XPUContext::Init();
}

XPUDeviceContext::~XPUDeviceContext() {}

XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
  pten::XPUContext::Init();
  LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                          << static_cast<int>(place.device);
}
...
...
@@ -302,7 +314,7 @@ void NPUDeviceContext::Wait() const {
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }

Place NPUDeviceContext::GetPlace() const { return place_; }
const Place& NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }
...
...
@@ -319,7 +331,7 @@ Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
...
...
@@ -470,102 +482,28 @@ CUDAContext::~CUDAContext() {
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  CUDADeviceGuard guard(place_.device);
  compute_capability_ = GetGPUComputeCapability(place_.device);
  multi_process_ = GetGPUMultiProcessors(place_.device);
  max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device);
  max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
  max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device);
  driver_version_ = GetGPUDriverVersion(place_.device);
  runtime_version_ = GetGPURuntimeVersion(place_.device);

  LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
                          << static_cast<int>(place_.device)
                          << ", GPU Compute Capability: "
                          << compute_capability_ / 10 << "."
                          << compute_capability_ % 10
                          << ", Driver API Version: " << driver_version_ / 1000
                          << "." << (driver_version_ % 100) / 10
                          << ", Runtime API Version: "
                          << runtime_version_ / 1000 << "."
                          << (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
  size_t version_major, version_minor, version_patch;
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
      &version_major, &version_minor, &version_patch));
  LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
                          << ", MIOpen Version: " << version_major << "."
                          << version_minor << "." << version_patch;
#else
  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
  LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
                          << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                          << (cudnn_dso_ver % 1000) / 100 << ".";
#endif
  {
    // Check CUDA/CUDNN version compatiblity
    auto local_cuda_version =
        (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
    auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
    auto compile_cuda_version =
        (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
    if (local_cuda_version < compile_cuda_version) {
      LOG_FIRST_N(WARNING, 1)
          << "WARNING: device: " << static_cast<int>(place_.device)
          << ". The installed Paddle is compiled with CUDA "
          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
          << ", but CUDA runtime version in your machine is "
          << local_cuda_version / 10 << "." << local_cuda_version % 10
          << ", which may cause serious incompatible bug. "
          << "Please recompile or reinstall Paddle with compatible CUDA "
             "version.";
    }
  }
  default_ctx_.reset(new CUDAContext(place_));
}

CUDADeviceContext::~CUDADeviceContext() {
  SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  if (nccl_comm_) {
    PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
  }
#endif
}

Place CUDADeviceContext::GetPlace() const { return place_; }

void CUDADeviceContext::Wait() const { context()->Stream()->Wait(); }

int CUDADeviceContext::GetComputeCapability() const {
  return compute_capability_;
}

int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
  return multi_process_ * max_threads_per_mp_;
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
    : pten::GPUContext(place) {
  pten::GPUContext::PartialInitWithoutAllocator();
  cuda_stream_.reset(
      new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}

int CUDADeviceContext::GetSMCount() const { return multi_process_; }

int CUDADeviceContext::GetMaxThreadsPerBlock() const {
  return max_threads_per_block_;
}

CUDADeviceContext::~CUDADeviceContext() = default;

Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
  if (thread_ctx_.count(this)) {
    return context()->EigenDevice().get();
  }
  return pten::GPUContext::eigen_device();
}

bool CUDADeviceContext::tensor_core_available() const {
  return context()->CublasTensorCoreHandle() != nullptr;
}

dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
  return max_grid_dim_size_;
void CUDADeviceContext::Wait() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->Wait();
    return;
  }
  pten::GPUContext::Wait();
#ifdef PADDLE_WITH_HIP
...
...
@@ -573,33 +511,96 @@ miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
  if (thread_ctx_.count(this)) {
    return context()->CudnnHandle();
  }
  return pten::GPUContext::cudnn_handle();
}

#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
  return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
  return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusparseHandle()->GetCusparseHandle();
  }
  return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusolverDnHandle();
  }
  return pten::GPUContext::cusolver_dn_handle();
}
#endif

void CUDADeviceContext::RecordEvent(
    gpuEvent_t ev, const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->RecordEvent(ev, callback);
    return;
  }
  pten::GPUContext::RecordEvent(ev, callback);
}

void CUDADeviceContext::AddStreamCallback(
    const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->AddCallback(callback);
    return;
  }
  pten::GPUContext::AddStreamCallback(callback);
}

void CUDADeviceContext::WaitStreamCallback() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->WaitCallback();
    return;
  }
  pten::GPUContext::WaitStreamCallback();
}

CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
  return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}

#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
  return context()->CusolverDnHandle();
gpuStream_t CUDADeviceContext::stream() const {
  if (thread_ctx_.count(this)) {
    return context()->RawStream();
  }
  return pten::GPUContext::stream();
}

std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
  if (!thread_ctx_.count(this)) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "CUDADeviceContext call context() failed, make sure in the "
        "thread_local semantic."));
  }
  return thread_ctx_.at(this);
}

stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
  return cuda_stream_.get();
}
#endif

gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }

stream::CUDAStream* CUDADeviceContext::SetCudaStream(
    stream::CUDAStream* new_stream_ptr) {
  auto* old_stream_ptr = cuda_stream_.release();
  cuda_stream_.reset(new_stream_ptr);
  return old_stream_ptr;
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
...
...
@@ -614,7 +615,7 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
...
...
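The net effect of the `device_context.cc` changes above is a delegation rule: every `CUDADeviceContext` accessor first checks the thread-local `CUDAContext` map and otherwise falls back to the state inherited from `pten::GPUContext`, so existing callers keep working unmodified. A hedged, caller-side sketch of that transparency for a CUDA (non-HIP) build; the function name and the fact that nothing is launched are illustrative:

```cpp
#include "paddle/fluid/platform/device_context.h"

// Existing fluid code keeps calling the same accessors; each either serves
// the thread-local CUDAContext (if ResetThreadContext was used on this
// thread) or the pten::GPUContext-owned handles.
void InspectContext(const paddle::platform::CUDADeviceContext& dev_ctx) {
  cudnnHandle_t cudnn = dev_ctx.cudnn_handle();   // thread-local or pten-owned
  cublasHandle_t blas = dev_ctx.cublas_handle();  // same fallback rule
  gpuStream_t stream = dev_ctx.stream();          // raw stream for launches
  (void)cudnn;
  (void)blas;
  (void)stream;
}
```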
paddle/fluid/platform/device_context.h
...
...
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
...
...
@@ -18,7 +19,9 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/core/device_context.h"
#include "paddle/fluid/memory/malloc.h"
...
...
@@ -28,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/fluid/platform/dynload/cusparse.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
...
...
@@ -38,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#include "paddle/pten/backends/gpu/gpu_context.h" // NOLINT
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
...
...
@@ -145,7 +150,7 @@ class IPUDeviceContext : public DeviceContext {
  explicit IPUDeviceContext(IPUPlace place);
  virtual ~IPUDeviceContext();
  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
  Place GetPlace() const override;
  const Place& GetPlace() const override;
  /*! \brief Wait for all operations completion in the stream. */
  void Wait() const override;
...
...
@@ -187,7 +192,7 @@ class NPUDeviceContext : public DeviceContext {
  explicit NPUDeviceContext(NPUPlace place);
  virtual ~NPUDeviceContext();
  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
  Place GetPlace() const override;
  const Place& GetPlace() const override;
  aclrtContext context() const;
/*! \brief Wait for all operations completion in the stream. */
...
...
@@ -247,7 +252,7 @@ class NPUPinnedDeviceContext : public DeviceContext {
  NPUPinnedDeviceContext();
  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);

  Place GetPlace() const override;
  const Place& GetPlace() const override;

  Eigen::DefaultDevice* eigen_device() const;
...
...
@@ -326,20 +331,20 @@ class CUDAContext {
#endif
/*! \brief Call cublas function safely. */
  template <typename Callback>
  inline void CublasCall(Callback&& callback) const {
  inline void CublasCall(
      const std::function<void(blasHandle_t)>& callback) const {
    if (cublas_tf32_tensor_core_handle_) {
      cublas_tf32_tensor_core_handle_->Call(std::forward<Callback>(callback));
      cublas_tf32_tensor_core_handle_->Call(callback);
    } else {
      cublas_handle_->Call(std::forward<Callback>(callback));
      cublas_handle_->Call(callback);
    }
  }

#ifndef PADDLE_WITH_HIP
  /*! \brief Call cusparse function safely. */
  template <typename Callback>
  inline void CusparseCall(Callback&& callback) const {
    cusparse_handle_->Call(std::forward<Callback>(callback));
  inline void CusparseCall(
      const std::function<void(pten::sparseHandle_t)>& callback) const {
    cusparse_handle_->Call(callback);
  }
#endif
...
...
@@ -348,12 +353,12 @@ class CUDAContext {
  /*! \brief Call cublas function with Tensor Core safely. If
      Tensor Core is not available, use DEFAULT_MATH instead. */
  template <typename Callback>
  inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
  inline void TensorCoreCublasCallIfAvailable(
      const std::function<void(blasHandle_t)>& callback) const {
    if (cublas_tensor_core_handle_) {
      cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
      cublas_tensor_core_handle_->Call(callback);
    } else {
      cublas_handle_->Call(std::forward<Callback>(callback));
      cublas_handle_->Call(callback);
    }
...
...
@@ -491,7 +496,7 @@ class CUDAContext {
  DISABLE_COPY_AND_ASSIGN(CUDAContext);
};

class CUDADeviceContext : public DeviceContext {
class CUDADeviceContext : public pten::GPUContext {
 public:
  explicit CUDADeviceContext(CUDAPlace place);
  virtual ~CUDADeviceContext();
...
...
@@ -499,49 +504,40 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief Wait for all operations completion in the stream. */
  void Wait() const override;

  /*! \brief Return place in the device context. */
  Place GetPlace() const override;

  /*! \brief Return compute capability in the device context. */
  int GetComputeCapability() const;

  /*! \brief Return the max physical thread count in the device context */
  int GetMaxPhysicalThreadCount() const;

  /*! \brief Return the SM count in the device context */
  int GetSMCount() const;

  /*! \brief Return the Max thread num of block in the device context */
  int GetMaxThreadsPerBlock() const;

  /*! \brief Return the max grid dim size in the device context */
  dim3 GetCUDAMaxGridDimSize() const;

  /*! \brief Return eigen device in the device context. */
  Eigen::GpuDevice* eigen_device() const;

  /*! \brief Call cublas function safely. */
  template <typename Callback>
  inline void CublasCall(Callback&& callback) const {
  inline void CublasCall(
      const std::function<void(blasHandle_t)>& callback) const {
    if (!thread_ctx_.count(this)) {
      pten::GPUContext::CublasCall(callback);
      return;
    }
    return context()->CublasCall(callback);
  }

#ifndef PADDLE_WITH_HIP
  /*! \brief Call cusparse function safely. */
  template <typename Callback>
  inline void CusparseCall(Callback&& callback) const {
    return context()->CusparseCall(callback);
  inline void CusparseCall(
      const std::function<void(pten::sparseHandle_t)>& callback) const {
    if (!thread_ctx_.count(this)) {
      pten::GPUContext::CusparseCall(callback);
      return;
    }
    context()->CusparseCall(callback);
  }
#endif

  /*! \brief Check whether tensor core is supported */
  bool tensor_core_available() const;

  /*! \brief Call cublas function with Tensor Core safely. If
      Tensor Core is not available, use DEFAULT_MATH instead. */
  template <typename Callback>
  inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
    return context()->TensorCoreCublasCallIfAvailable(callback);
  inline void TensorCoreCublasCallIfAvailable(
      const std::function<void(blasHandle_t)>& callback) const {
    if (!thread_ctx_.count(this)) {
      pten::GPUContext::TensorCoreCublasCallIfAvailable(callback);
      return;
    }
    context()->TensorCoreCublasCallIfAvailable(callback);
  }
/*! \brief Return cudnn handle in the device context. */
...
...
@@ -559,6 +555,10 @@ class CUDADeviceContext : public DeviceContext {
  cusparseHandle_t cusparse_handle() const;
#endif

#ifndef PADDLE_WITH_HIP
  cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
* Once the first cudnn function is called by the handle, a lock
...
...
@@ -568,60 +568,33 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
  CudnnWorkspaceHandle cudnn_workspace_handle() const;

#ifndef PADDLE_WITH_HIP
  cusolverDnHandle_t cusolver_dn_handle() const;
#endif

  /*! \brief Return cuda stream in the device context. */
  gpuStream_t stream() const;

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  /*! \brief Return nccl communicators. */
  ncclComm_t nccl_comm() const { return nccl_comm_; }

  /*! \brief Set nccl communicators. */
  void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif

  template <typename Callback>
  void RecordEvent(gpuEvent_t ev, Callback callback) const {
    return context()->Stream()->RecordEvent(ev, callback);
  }

  template <typename Callback>
  void AddStreamCallback(Callback&& callback) const {
    return context()->Stream()->AddCallback(callback);
  }
  void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;

  void WaitStreamCallback() const {
    return context()->Stream()->WaitCallback();
  }
  void AddStreamCallback(const std::function<void()>& callback) const;

  void ResetDefaultContext(const stream::Priority& priority) {
    default_ctx_.reset(new CUDAContext(place_, priority));
  }
  void WaitStreamCallback() const;

  void ResetThreadContext(const stream::Priority& priority) {
    std::lock_guard<std::mutex> guard(ctx_mtx_);
    thread_ctx_[this].reset(new CUDAContext(place_, priority));
    thread_ctx_[this].reset(new CUDAContext(this->GetPlace(), priority));
  }

  std::shared_ptr<CUDAContext> context() const {
    if (!thread_ctx_.count(this)) {
      return default_ctx_;
    }
    return thread_ctx_.at(this);
  }
  std::shared_ptr<CUDAContext> context() const;

  // Note: Can only be used under thread_local semantics.
  void SetThreadLocalStream(const gpuStream_t stream) {
    thread_ctx_.at(this)->SetStream(stream);
  }

 private:
  CUDAPlace place_;
  std::shared_ptr<CUDAContext> default_ctx_;
  // NOTE: Just for compatibility with the past, please delete if there is an
  // elegant way.
  stream::CUDAStream* GetCudaStream() const;
  stream::CUDAStream* SetCudaStream(stream::CUDAStream*);

 private:
  // The thread_local static variable will be released before the
  // global static variable, so avoid using it in dtor.
  static thread_local std::unordered_map<const CUDADeviceContext*,
...
...
@@ -631,22 +604,9 @@ class CUDADeviceContext : public DeviceContext {
  mutable std::mutex cudnn_handle_mtx_;

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  // NCCL communicator (single process version) for NCCL collective operations.
  // NCCL collective operations provides fast collectives over multiple GPUs
  // both within and across nodes.
  // But, this collectives is used for collectives over multiple GPUs within
  // nodes.
  ncclComm_t nccl_comm_{nullptr};
#endif

  int compute_capability_;
  int runtime_version_;
  int driver_version_;
  int multi_process_;
  int max_threads_per_mp_;
  int max_threads_per_block_;
  dim3 max_grid_dim_size_;
  // NOTE: Just for compatibility with the past, please delete if there is an
  // elegant way.
  std::unique_ptr<stream::CUDAStream> cuda_stream_;

  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
...
...
@@ -711,7 +671,7 @@ class CUDAPinnedDeviceContext : public DeviceContext {
  CUDAPinnedDeviceContext();
  explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);

  Place GetPlace() const override;
  const Place& GetPlace() const override;

  Eigen::DefaultDevice* eigen_device() const;
...
...
paddle/fluid/platform/device_context_test.cu
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
TEST(Device, Init) {
  using paddle::platform::DeviceContext;
...
...
@@ -26,6 +27,20 @@ TEST(Device, Init) {
  int count = paddle::platform::GetGPUDeviceCount();
  for (int i = 0; i < count; i++) {
    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
    device_context->SetAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(CUDAPlace(i), device_context->stream())
            .get());
    device_context->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    device_context->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(CUDAPlace(i))
            .get());
    device_context->PartialInitWithAllocator();
    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
    ASSERT_NE(nullptr, gpu_device);
    delete device_context;
...
...
@@ -39,6 +54,19 @@ TEST(Device, CUDADeviceContext) {
  int count = paddle::platform::GetGPUDeviceCount();
  for (int i = 0; i < count; i++) {
    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
    device_context->SetAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(CUDAPlace(i), device_context->stream())
            .get());
    device_context->SetHostAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetAllocator(paddle::platform::CPUPlace())
            .get());
    device_context->SetZeroAllocator(
        paddle::memory::allocation::AllocatorFacade::Instance()
            .GetZeroAllocator(CUDAPlace(i))
            .get());
    device_context->PartialInitWithAllocator();
    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
    ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
...
...
paddle/fluid/platform/device_event_gpu.cc
...
...
@@ -53,7 +53,7 @@ void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
      platform::errors::PreconditionNotMet(
          "Failed to dynamic_cast context into CUDADeviceContext."));
  wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream());
  wrapper->inner_event_.Record(cuda_dev_ctx->stream());
}
bool DeviceEventQueryCUDA(const DeviceEvent* event) {
...
...
@@ -82,8 +82,7 @@ void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
      platform::errors::PreconditionNotMet(
          "Failed to dynamic_cast context into CUDADeviceContext."));
  // calling cudaStreamWaitEvent(stream, event, 0)
  cuda_dev_ctx->context()->Stream()->WaitEvent(
      wrapper->inner_event_.GetRawCudaEvent());
  cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
}
void DeviceEventCPUWaitCUDA(const DeviceEvent* event,
...
paddle/fluid/platform/device_event_test.cc
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/platform/device_event.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/place.h"
using ::paddle::platform::kCUDA;
using ::paddle::platform::kCPU;
...
...
@@ -38,9 +39,11 @@ TEST(DeviceEvent, CUDA) {
  // case 1. test for event_creator
  DeviceEvent event(place);
  ASSERT_NE(event.GetEvent().get(), nullptr);
  bool status = event.Query();
  ASSERT_EQ(status, true);
  // case 2. test for event_recorder
  event.Record(context);
  bool status = event.Query();
  status = event.Query();
  ASSERT_EQ(status, false);
  // case 3. test for event_finisher
  event.Finish();
...
...
paddle/fluid/platform/enforce.h
...
...
@@ -539,7 +539,7 @@ inline void retry_sleep(unsigned milliseconds) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
      retry_sleep(FLAGS_gpu_allocator_retry_time);                            \
      paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);          \
__cond__ = (COND); \
++retry_count; \
} \
...
...
@@ -727,7 +727,7 @@ inline void retry_sleep(unsigned millisecond) {
::paddle::platform::details::ExternalApiType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
      retry_sleep(FLAGS_gpu_allocator_retry_time);                            \
      ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time);        \
__cond__ = (COND); \
++retry_count; \
} \
...
...
paddle/fluid/platform/event.h
...
...
@@ -152,11 +152,11 @@ class CudaEvent {
#endif
  }

  void Record(const paddle::platform::stream::CUDAStream& stream) {
  void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
  }
...
...
paddle/fluid/platform/float16_test.cu
...
...
@@ -328,6 +328,10 @@ TEST(float16, lod_tensor_on_gpu) {
  // CPU LoDTensor to GPU LoDTensor
  CUDAPlace gpu_place(0);
  CUDADeviceContext gpu_ctx(gpu_place);
  gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, gpu_ctx.stream())
                           .get());
  gpu_ctx.PartialInitWithAllocator();
  framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor);
// GPU LoDTensor to CPU LoDTensor
...
...
paddle/fluid/platform/for_range.h
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
namespace paddle {
namespace platform {
...
...
@@ -72,6 +73,7 @@ __global__ static void ForRangeElemwiseOp(Function func, size_t limit) {
  }
}

// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<CUDADeviceContext> {
  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
...
...
@@ -106,6 +108,40 @@ struct ForRange<CUDADeviceContext> {
  size_t limit_;
};

template <>
struct ForRange<pten::GPUContext> {
  ForRange(const pten::GPUContext& dev_ctx, size_t limit)
      : dev_ctx_(dev_ctx), limit_(static_cast<size_t>(limit)) {}

  template <typename Function>
  inline void operator()(Function func) const {
#ifdef __HIPCC__
    // HIP will throw core dump when threads > 256
    constexpr int num_threads = 256;
#elif WITH_NV_JETSON
    // JETSON_NANO will throw core dump when threads > 128
    int num_thread = 256;
    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
    const int num_threads = num_thread;
#else
    constexpr int num_threads = 1024;
#endif
    size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
    size_t grid_size = (limit_ + num_threads - 1) / num_threads;

    if (grid_size == 1) {
      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
          func);
    } else {
      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
          func, limit_);
    }
  }

  const pten::GPUContext& dev_ctx_;
  size_t limit_;
};
#endif
}  // namespace platform
...
...
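The new specialization mirrors `ForRange<CUDADeviceContext>`; usage does not change apart from the context type. A hedged sketch of how such a functor is typically driven (the functor, helper, and the `hostdevice.h` include are illustrative assumptions, not part of this diff):

#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/hostdevice.h"

// Illustrative functor: scales each element in place.
template <typename T>
struct ScaleFunctor {
  T* data;
  T factor;
  HOSTDEVICE void operator()(size_t i) const { data[i] *= factor; }
};

// Hedged usage sketch, assuming `dev_ctx` is an initialized pten::GPUContext
// and `d_data` points to `n` device elements.
void ScaleOnGpu(const pten::GPUContext& dev_ctx, float* d_data, size_t n) {
  paddle::platform::ForRange<pten::GPUContext> for_range(dev_ctx, n);
  for_range(ScaleFunctor<float>{d_data, 2.0f});
}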
paddle/fluid/platform/stream/cuda_stream.cc
View file @ a821c4a9
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
...
...
@@ -116,11 +117,8 @@ CUDAStream* get_current_stream(int deviceId) {
   platform::Place device = CUDAPlace(deviceId);
-  auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
-                    ->context()
-                    ->Stream()
-                    .get();
-  return stream;
+  return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
+      ->GetCudaStream();
 #else
   PADDLE_THROW(platform::errors::Unavailable(
       "Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
...
...
@@ -133,12 +131,12 @@ CUDAStream* set_current_stream(CUDAStream* stream) {
   auto& device = stream->GetPlace();
   auto& pool = platform::DeviceContextPool::Instance();
   return static_cast<platform::CUDADeviceContext*>(pool.Get(device))
-      ->context()
-      ->SetStream(stream);
+      ->SetCudaStream(stream);
 #else
   PADDLE_THROW(platform::errors::Unavailable(
-      "Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
-  return nullptr;
+      "Paddle is not compiled with CUDA. Cannot visit cuda current"
+      "stream."));
+  return CUDAStream(nullptr);
 #endif
 }
 }  // namespace stream
...
...
paddle/fluid/platform/stream/cuda_stream.h
View file @ a821c4a9
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <cstdint>
 #include <functional>
+#include <memory>
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
...
...
@@ -51,24 +52,28 @@ class CUDAStream final {
                       const StreamFlag& flag = StreamFlag::kDefaultFlag) {
     Init(place, priority, flag);
   }
+  explicit CUDAStream(gpuStream_t stream, const Place& place)
+      : place_(place), stream_(stream) {
+    owned_stream_ = false;
+    callback_manager_.reset(new StreamCallbackManager<gpuStream_t>(stream_));
+  }
   virtual ~CUDAStream() { Destroy(); }

   bool Init(const Place& place, const Priority& priority = Priority::kNormal,
             const StreamFlag& flag = StreamFlag::kDefaultFlag);

-  template <typename Callback>
-  void AddCallback(Callback&& callback) const {
+  void AddCallback(std::function<void()> callback) const {
     callback_manager_->AddCallback(callback);
   }

-  template <typename Callback>
 #ifdef PADDLE_WITH_HIP
-  void RecordEvent(hipEvent_t ev, Callback callback) const {
+  void RecordEvent(hipEvent_t ev, const std::function<void()>& callback) const {
     callback();
     PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_));
   }
 #else
-  void RecordEvent(cudaEvent_t ev, Callback callback) const {
+  void RecordEvent(cudaEvent_t ev, const std::function<void()>& callback) const {
     callback();
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_));
   }
...
...
@@ -149,6 +154,7 @@ class CUDAStream final {
 };

 CUDAStream* get_current_stream(int deviceId);
+// NOTE: There is a problem with the interface and needs to be fixed
 CUDAStream* set_current_stream(CUDAStream* stream);

 }  // namespace stream
...
...
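The callback interfaces now take `std::function<void()>` instead of a template parameter, so callers can pass lambdas or bound functions without the header instantiating templates per call site. A hedged sketch (the lambda body is illustrative):

#include <functional>
#include <iostream>
#include "paddle/fluid/platform/stream/cuda_stream.h"

// Hedged sketch: enqueue a host callback on a CUDAStream. `stream` is assumed
// to be an initialized paddle::platform::stream::CUDAStream.
void NotifyWhenDone(const paddle::platform::stream::CUDAStream& stream) {
  std::function<void()> cb = [] { std::cout << "stream work finished\n"; };
  stream.AddCallback(cb);  // type-erased callback, no template instantiation
}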
paddle/fluid/platform/transform_test.cu
View file @ a821c4a9
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/transform.h"
...
...
@@ -57,6 +58,10 @@ TEST(Transform, CPUUnary) {
 TEST(Transform, GPUUnary) {
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(gpu0, ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
   float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
...
...
@@ -84,6 +89,10 @@ TEST(Transform, GPUBinary) {
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
+  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                       .GetAllocator(gpu0, ctx.stream())
+                       .get());
+  ctx.PartialInitWithAllocator();
   auto gpu_allocation = Alloc(gpu0, sizeof(buf));
   int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
...
...
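Both tests now wire an allocator into the context before using it, because the context no longer sets one up implicitly. That setup boils down to a small helper like the following hedged sketch (the helper name is ours, not Paddle's):

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

// Hypothetical helper mirroring the test setup: give a CUDADeviceContext a
// device allocator bound to its stream, then finish its partial init.
void PrepareContext(paddle::platform::CUDADeviceContext* ctx,
                    const paddle::platform::CUDAPlace& place) {
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  ctx->PartialInitWithAllocator();
}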
paddle/fluid/pybind/cuda_streams_py.cc
View file @ a821c4a9
...
...
@@ -34,7 +34,7 @@ void BindCudaStream(py::module *m_ptr) {
           return paddle::platform::stream::get_current_stream(deviceId);
 #else
           PADDLE_THROW(platform::errors::Unavailable(
-              "Paddle is not compiled with CUDA. Cannot visit cuda current "
+              "Paddle is not compiled with CUDA. Cannot visit cuda current"
               "stream."));
 #endif
         },
...
...
@@ -119,7 +119,7 @@ void BindCudaStream(py::module *m_ptr) {
          [](paddle::platform::stream::CUDAStream &self,
             paddle::platform::stream::CUDAStream &stream) {
            paddle::platform::CudaEvent event;
-           event.Record(stream);
+           event.Record(stream.raw_stream());

            self.WaitEvent(event.GetRawCudaEvent());
          },
...
...
@@ -179,7 +179,7 @@ void BindCudaStream(py::module *m_ptr) {
            if (event == nullptr) {
              event = new paddle::platform::CudaEvent();
            }
-           event->Record(self);
+           event->Record(self.raw_stream());
            return event;
          },
          R"DOC(
...
...
@@ -321,7 +321,7 @@ void BindCudaStream(py::module *m_ptr) {
            if (stream == nullptr) {
              stream = paddle::platform::stream::get_current_stream(-1);
            }
-           self.Record(*stream);
+           self.Record(stream->raw_stream());
          },
          R"DOC(
      Records the event in the given stream.
...
...
paddle/fluid/pybind/pybind.cc
View file @ a821c4a9
...
...
@@ -1596,7 +1596,20 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CPUPlace& place)
                       -> paddle::platform::DeviceContext* {
-                    return new paddle::platform::CPUDeviceContext();
+                    auto* context = new paddle::platform::CPUDeviceContext();
+                    context->SetAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(place)
+                            .get());
+                    context->SetHostAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(paddle::platform::CPUPlace())
+                            .get());
+                    context->SetZeroAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetZeroAllocator(place)
+                            .get());
+                    return context;
                   })
       .def_static("create",
                   [](paddle::platform::XPUPlace& place)
...
...
@@ -1607,7 +1620,20 @@ All parameter, weight, gradient are variables in Paddle.
                         "Cannot use XPUPlace in CPU/GPU version, "
                         "Please recompile or reinstall Paddle with XPU support."));
 #else
-                    return new paddle::platform::XPUDeviceContext(place);
+                    auto* context = new paddle::platform::XPUDeviceContext(place);
+                    context->SetAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(place)
+                            .get());
+                    context->SetHostAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(paddle::platform::CPUPlace())
+                            .get());
+                    context->SetZeroAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetZeroAllocator(place)
+                            .get());
+                    return context;
 #endif
                   })
       .def_static("create",
...
...
@@ -1643,7 +1669,21 @@ All parameter, weight, gradient are variables in Paddle.
                         "Cannot use CUDAPlace in CPU only version, "
                         "Please recompile or reinstall Paddle with CUDA support."));
 #else
-                    return new paddle::platform::CUDADeviceContext(place);
+                    auto* context = new paddle::platform::CUDADeviceContext(place);
+                    context->SetAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(place, context->stream())
+                            .get());
+                    context->SetHostAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetAllocator(paddle::platform::CPUPlace())
+                            .get());
+                    context->SetZeroAllocator(
+                        paddle::memory::allocation::AllocatorFacade::Instance()
+                            .GetZeroAllocator(place)
+                            .get());
+                    context->PartialInitWithAllocator();
+                    return context;
 #endif
                   })
       .def_static("create",
...
...
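All three `create` bindings now repeat the same allocator wiring (the CUDA one additionally binds the device allocator to the context's stream and calls `PartialInitWithAllocator`). If the pattern keeps spreading, it could be factored into one helper; a hedged sketch for the CPU/XPU shape of it (not part of the commit):

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Hypothetical helper: attach device/host/zero allocators to a freshly
// created device context, mirroring the repeated pybind.cc pattern.
template <typename Context, typename Place>
Context* WireAllocators(Context* context, const Place& place) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  context->SetAllocator(facade.GetAllocator(place).get());
  context->SetHostAllocator(
      facade.GetAllocator(paddle::platform::CPUPlace()).get());
  context->SetZeroAllocator(facade.GetZeroAllocator(place).get());
  return context;
}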
paddle/pten/backends/CMakeLists.txt
View file @ a821c4a9
...
...
@@ -2,6 +2,10 @@ add_subdirectory(dynload)
 add_subdirectory(cpu)

+if(WITH_GPU OR WITH_ROCM)
+  add_subdirectory(gpu)
+endif()
+
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
...
...
@@ -11,3 +15,7 @@ cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context)
 if(WITH_XPU)
   add_dependencies(pten_context xpu_context)
 endif()
+
+if(WITH_GPU)
+  add_dependencies(pten_context gpu_context)
+endif()
paddle/pten/backends/cpu/cpu_context.cc
View file @ a821c4a9
...
...
@@ -15,75 +15,59 @@
 #include "paddle/pten/backends/cpu/cpu_context.h"

 #include "paddle/pten/api/ext/exception.h"
+#include "paddle/pten/common/place.h"

 // NOTE: The paddle framework should add WITH_EIGEN option to support compile
 // without eigen.
+#include "paddle/pten/core/device_context.h"
 #include "unsupported/Eigen/CXX11/Tensor"

 namespace pten {

-struct CPUContext::CPUImpl {
-  CPUImpl() { device_ = new Eigen::DefaultDevice(); }
+struct CPUContext::Impl {
+  Impl() : place_(CPUPlace()) {}

-  // Users need to manage external resources.
-  explicit CPUImpl(const CPUContextResource& ctx_res) : res_(ctx_res) {
-    device_ = res_.device;
-  }
+  explicit Impl(const Place& place) : place_(place) {}

-  ~CPUImpl() {
-    if (res_.device == nullptr && device_ != nullptr) {
-      delete device_;
-      device_ = nullptr;
+  ~Impl() {
+    if (owned_) {
+      delete eigen_device_;
     }
   }

-  Eigen::DefaultDevice* GetEigenDevice() const {
-    PD_CHECK(device_ != nullptr, "the eigen_device is nullptr.");
-    return device_;
+  void Init() {
+    owned_ = true;
+    eigen_device_ = new Eigen::DefaultDevice();
   }

-  void SetEigenDevice(Eigen::DefaultDevice* device) {
-    if (device == nullptr) {
-      return;
-    }
-    res_.device = device;
-    device_ = device;
+  Eigen::DefaultDevice* GetEigenDevice() const {
+    PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr.");
+    return eigen_device_;
   }

-  Place GetPlace() const { return place_; }
-
-  Eigen::DefaultDevice* device_{nullptr};
-  CPUContextResource res_;
-  CPUPlace place_;
+  bool owned_{false};
+  Eigen::DefaultDevice* eigen_device_{nullptr};
+  Place place_;
 };

-CPUContext::CPUContext() : DeviceContext() {
-  cpu_impl_ = std::make_unique<CPUImpl>();
-}
-
-CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
-  cpu_impl_ = std::make_unique<CPUImpl>();
-  cpu_impl_->SetEigenDevice(other.eigen_device());
-}
+CPUContext::CPUContext()
+    : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}

-CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
-  cpu_impl_ = std::move(other.cpu_impl_);
-}
+CPUContext::CPUContext(const Place& place)
+    : DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}

 CPUContext::~CPUContext() = default;

-CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
-  cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
-}
+void CPUContext::Init() { impl_->Init(); }

 Eigen::DefaultDevice* CPUContext::eigen_device() const {
-  return cpu_impl_->GetEigenDevice();
+  return impl_->GetEigenDevice();
 }

+const Place& CPUContext::GetPlace() const { return impl_->place_; }
+
 void CPUContext::SetEigenDevice(Eigen::DefaultDevice* device) {
-  cpu_impl_->SetEigenDevice(device);
+  impl_->eigen_device_ = device;
 }

-Place CPUContext::GetPlace() const { return cpu_impl_->GetPlace(); }
-
 }  // namespace pten
paddle/pten/backends/cpu/cpu_context.h
View file @ a821c4a9
...
...
@@ -24,37 +24,29 @@ limitations under the License. */
 namespace pten {

-struct CPUContextResource {
-  Eigen::DefaultDevice* device{nullptr};
-};
-
 class CPUContext : public DeviceContext {
  public:
-  // NOTE: DeviceContext hold resources. Used in training scenarios.
   CPUContext();
-  // NOTE: Share the same underlying resources, please ensure that resources are
-  // not released.
-  CPUContext(const CPUContext&);
-  CPUContext(CPUContext&&);
-  ~CPUContext();
+  explicit CPUContext(const Place&);
+  virtual ~CPUContext();

   Eigen::DefaultDevice* eigen_device() const;

-  // TODO(wilber): Whether the interface should be preserved.
-  Place GetPlace() const override;
+  const Place& GetPlace() const override;

- public:
-  // NOTE: External users manage resources. Used in inference scenarios.
-  explicit CPUContext(const CPUContextResource& ctx_res);
+  // NOTE: DeviceContext hold resources. Used in training scenarios.
+  // The interface used by the training scene, DeviceContext will initialize
+  // all resources and delete them when destructing.
+  void Init();

+ protected:
+  // NOTE: External users manage resources. Used in inference scenarios.
+  // The Set interface is for inference only, DeviceContext will mark the
+  // resource as external, and will not delete any resource when destructing.
   void SetEigenDevice(Eigen::DefaultDevice* device);

  private:
-  struct CPUImpl;
-  std::unique_ptr<CPUImpl> cpu_impl_;
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
 };

 }  // namespace pten
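The reworked `CPUContext` separates the owning path (`Init()`, training) from the non-owning path (`SetEigenDevice`, inference); the setter is now `protected`, so the external-resource path is normally exercised inside the framework rather than by end users. A hedged sketch of the owning path only, using just the public interface shown above:

#include "paddle/pten/backends/cpu/cpu_context.h"

// Hedged sketch of the "training" path: the context owns its Eigen device.
void OwnResources() {
  pten::CPUContext ctx;
  ctx.Init();                          // allocates the Eigen::DefaultDevice
  auto* eigen_dev = ctx.eigen_device();
  (void)eigen_dev;                     // hand it to Eigen-based kernels
}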
paddle/pten/backends/gpu/CMakeLists.txt
0 → 100644
View file @ a821c4a9

if(WITH_GPU)
  add_subdirectory(cuda)
  nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda)
elseif(WITH_ROCM)
  add_subdirectory(rocm)
  hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda)
endif()

cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3)
paddle/pten/backends/gpu/cuda/CMakeLists.txt
0 → 100644
View file @ a821c4a9

nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda)
paddle/pten/backends/gpu/cuda/cuda_helper.h
0 → 100644
View file @ a821c4a9
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, such as over 1GB, the first iteration is no problem here,
* but when `i += blockDim.x * gridDim.x` is executed, the value of i
* will greater than INT_MAX and overflow becomes negative value, at
* this time, the cycle condition `i < (n)` is still satisfied, so it
* will cause illegal access to cuda memory.
*
* Here is a real example in ERINE, it will trigger above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follow, the int64_t __index__ will
* prevent overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
}  // namespace gpu
}  // namespace backends
}  // namespace pten
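The macro is used inside kernels so the running index stays 64-bit and cannot overflow as described in the comment above. A hedged sketch of a kernel written with it (the kernel itself is illustrative, not from this file):

#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"

// Illustrative CUDA kernel: y[i] = a * x[i] + y[i], iterated with a
// grid-stride loop so a single launch covers any n without 32-bit overflow.
template <typename T>
__global__ void Axpy(const T* x, T* y, T a, int64_t n) {
  CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { y[i] = a * x[i] + y[i]; }
}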
paddle/fluid/platform/device/gpu/cuda/cuda_info.cc → paddle/pten/backends/gpu/cuda/cuda_info.cc
View file @ a821c4a9
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
...
...
@@ -12,20 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/pten/backends/gpu/gpu_info.h"

+// TODO(pten): remove fluid headers.
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"

 static std::once_flag g_device_props_size_init_flag;
 static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
-static std::vector<paddle::gpuDeviceProp> g_device_props;
+static std::vector<pten::gpuDeviceProp> g_device_props;

-namespace paddle {
-namespace platform {
+namespace pten {
+namespace backends {
+namespace gpu {

 int DnnVersion() {
   if (!dynload::HasCUDNN()) return -1;
   return dynload::cudnnGetVersion();
...
...
@@ -75,11 +74,13 @@ int GetGPUDeviceCount() {
}
int
GetGPUComputeCapability
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
major
,
minor
;
auto
major_error_code
=
cudaDeviceGetAttribute
(
&
major
,
cudaDevAttrComputeCapabilityMajor
,
id
);
...
...
@@ -92,22 +93,26 @@ int GetGPUComputeCapability(int id) {
}
int
GetGPURuntimeVersion
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
runtime_version
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaRuntimeGetVersion
(
&
runtime_version
));
return
runtime_version
;
}
int
GetGPUDriverVersion
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
driver_version
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaDriverGetVersion
(
&
driver_version
));
return
driver_version
;
...
...
@@ -120,11 +125,13 @@ bool TensorCoreAvailable() {
}
int
GetGPUMultiProcessors
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
count
;
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaDeviceGetAttribute
(
&
count
,
cudaDevAttrMultiProcessorCount
,
id
));
...
...
@@ -132,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}
int
GetGPUMaxThreadsPerMultiProcessor
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
count
;
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaDeviceGetAttribute
(
&
count
,
cudaDevAttrMaxThreadsPerMultiProcessor
,
id
));
...
...
@@ -145,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}
int
GetGPUMaxThreadsPerBlock
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
int
count
;
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaDeviceGetAttribute
(
&
count
,
cudaDevAttrMaxThreadsPerBlock
,
id
));
...
...
@@ -162,32 +173,34 @@ int GetCurrentDeviceId() {
return
device_id
;
}
dim3
GetGpuMaxGridDimSize
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
std
::
array
<
int
,
3
>
GetGpuMaxGridDimSize
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
dim3
ret
;
id
,
GetGPUDeviceCount
()));
std
::
array
<
int
,
3
>
ret
;
int
size
;
auto
error_code_x
=
cudaDeviceGetAttribute
(
&
size
,
cudaDevAttrMaxGridDimX
,
id
);
PADDLE_ENFORCE_GPU_SUCCESS
(
error_code_x
);
ret
.
x
=
size
;
ret
[
0
]
=
size
;
auto
error_code_y
=
cudaDeviceGetAttribute
(
&
size
,
cudaDevAttrMaxGridDimY
,
id
);
PADDLE_ENFORCE_GPU_SUCCESS
(
error_code_y
);
ret
.
y
=
size
;
ret
[
1
]
=
size
;
auto
error_code_z
=
cudaDeviceGetAttribute
(
&
size
,
cudaDevAttrMaxGridDimZ
,
id
);
PADDLE_ENFORCE_GPU_SUCCESS
(
error_code_z
);
ret
.
z
=
size
;
ret
[
2
]
=
size
;
return
ret
;
}
const
gpuDeviceProp
&
GetDeviceProperties
(
int
id
)
{
std
::
call_once
(
g_device_props_size_init_flag
,
[
&
]
{
int
gpu_num
=
0
;
gpu_num
=
platform
::
GetGPUDeviceCount
();
gpu_num
=
GetGPUDeviceCount
();
g_device_props_init_flags
.
resize
(
gpu_num
);
g_device_props
.
resize
(
gpu_num
);
for
(
int
i
=
0
;
i
<
gpu_num
;
++
i
)
{
...
...
@@ -196,16 +209,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
});
if
(
id
==
-
1
)
{
id
=
platform
::
GetCurrentDeviceId
();
id
=
GetCurrentDeviceId
();
}
if
(
id
<
0
||
id
>=
static_cast
<
int
>
(
g_device_props
.
size
()))
{
PADDLE_THROW
(
platform
::
errors
::
OutOfRange
(
PADDLE_THROW
(
p
addle
::
p
latform
::
errors
::
OutOfRange
(
"The device id %d is out of range [0, %d), where %d is the number of "
"devices on this machine. Because the device id should be greater than "
"or equal to zero and smaller than the number of gpus. Please input "
"appropriate device again!"
,
id
,
static_cast
<
int
>
(
g_device_props
.
size
()),
id
,
static_cast
<
int
>
(
g_device_props
.
size
()),
static_cast
<
int
>
(
g_device_props
.
size
())));
}
...
...
@@ -219,32 +233,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void
SetDeviceId
(
int
id
)
{
// TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_LT
(
id
,
GetGPUDeviceCount
(),
paddle
::
platform
::
errors
::
InvalidArgument
(
"Device id must be less than GPU count, "
"but received id is: %d. GPU count is: %d."
,
id
,
GetGPUDeviceCount
()));
id
,
GetGPUDeviceCount
()));
PADDLE_RETRY_CUDA_SUCCESS
(
cudaSetDevice
(
id
));
}
void
GpuMemcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
gpuMemcpyKind
kind
,
gpuStream_t
stream
)
{
void
GpuMemcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
gpuMemcpyKind
kind
,
gpuStream_t
stream
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemcpyAsync
(
dst
,
src
,
count
,
kind
,
stream
));
}
void
GpuMemcpySync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
void
GpuMemcpySync
(
void
*
dst
,
const
void
*
src
,
size_t
count
,
gpuMemcpyKind
kind
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemcpy
(
dst
,
src
,
count
,
kind
));
}
void
GpuMemcpyPeerAsync
(
void
*
dst
,
int
dst_device
,
const
void
*
src
,
int
src_device
,
size_t
count
,
gpuStream_t
stream
)
{
void
GpuMemcpyPeerAsync
(
void
*
dst
,
int
dst_device
,
const
void
*
src
,
int
src_device
,
size_t
count
,
gpuStream_t
stream
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemcpyPeerAsync
(
dst
,
dst_device
,
src
,
src_device
,
count
,
stream
));
}
void
GpuMemcpyPeerSync
(
void
*
dst
,
int
dst_device
,
const
void
*
src
,
int
src_device
,
size_t
count
)
{
void
GpuMemcpyPeerSync
(
void
*
dst
,
int
dst_device
,
const
void
*
src
,
int
src_device
,
size_t
count
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemcpyPeer
(
dst
,
dst_device
,
src
,
src_device
,
count
));
}
...
...
@@ -264,5 +289,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void
GpuDeviceSync
()
{
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaDeviceSynchronize
());
}
gpuError_t
GpuGetLastError
()
{
return
cudaGetLastError
();
}
-}  // namespace platform
-}  // namespace paddle
+}  // namespace gpu
+}  // namespace backends
+}  // namespace pten
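Besides the namespace move, one visible interface change in this file is that `GetGpuMaxGridDimSize` now returns `std::array<int, 3>` instead of CUDA's `dim3`, which keeps the pten headers free of CUDA types. A hedged sketch of a caller adapting to the new return type (the clamp helper is ours, only the query function comes from this commit):

#include <array>

// Hedged sketch: clamp a requested grid size against the device limits
// reported by the std::array-based query introduced here.
std::array<int, 3> ClampGrid(const std::array<int, 3>& requested,
                             const std::array<int, 3>& max_grid) {
  std::array<int, 3> out;
  for (int i = 0; i < 3; ++i) {
    out[i] = requested[i] < max_grid[i] ? requested[i] : max_grid[i];
  }
  return out;
}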
paddle/pten/backends/gpu/forwards.h
0 → 100644
View file @ a821c4a9
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Forward-declares CUDA API types used in platform-agnostic wrapper headers.
#pragma once
/// Forward declaration of Eigen types.
namespace
Eigen
{
struct
GpuDevice
;
}
// namespace Eigen
/// Forward declaration of CUDA types.
// Forward declaration of CUDA runtime types.
using
cudaStream_t
=
struct
CUstream_st
*
;
using
cudaEvent_t
=
struct
CUevent_st
*
;
// Forward declaration of cuDNN types.
using
cudnnHandle_t
=
struct
cudnnContext
*
;
using
cudnnTensorDescriptor_t
=
struct
cudnnTensorStruct
*
;
using
cudnnConvolutionDescriptor_t
=
struct
cudnnConvolutionStruct
*
;
using
cudnnPoolingDescriptor_t
=
struct
cudnnPoolingStruct
*
;
using
cudnnFilterDescriptor_t
=
struct
cudnnFilterStruct
*
;
using
cudnnLRNDescriptor_t
=
struct
cudnnLRNStruct
*
;
using
cudnnActivationDescriptor_t
=
struct
cudnnActivationStruct
*
;
using
cudnnSpatialTransformerDescriptor_t
=
struct
cudnnSpatialTransformerStruct
*
;
using
cudnnOpTensorDescriptor_t
=
struct
cudnnOpTensorStruct
*
;
using
cudnnReduceTensorDescriptor_t
=
struct
cudnnReduceTensorStruct
*
;
using
cudnnCTCLossDescriptor_t
=
struct
cudnnCTCLossStruct
*
;
using
cudnnTensorTransformDescriptor_t
=
struct
cudnnTensorTransformStruct
*
;
using
cudnnDropoutDescriptor_t
=
struct
cudnnDropoutStruct
*
;
using
cudnnRNNDescriptor_t
=
struct
cudnnRNNStruct
*
;
using
cudnnPersistentRNNPlan_t
=
struct
cudnnPersistentRNNPlan
*
;
using
cudnnRNNDataDescriptor_t
=
struct
cudnnRNNDataStruct
*
;
using
cudnnAlgorithmDescriptor_t
=
struct
cudnnAlgorithmStruct
*
;
using
cudnnAlgorithmPerformance_t
=
struct
cudnnAlgorithmPerformanceStruct
*
;
using
cudnnSeqDataDescriptor_t
=
struct
cudnnSeqDataStruct
*
;
using
cudnnAttnDescriptor_t
=
struct
cudnnAttnStruct
*
;
using
cudnnFusedOpsConstParamPack_t
=
struct
cudnnFusedOpsConstParamStruct
*
;
using
cudnnFusedOpsVariantParamPack_t
=
struct
cudnnFusedOpsVariantParamStruct
*
;
using
cudnnFusedOpsPlan_t
=
struct
cudnnFusedOpsPlanStruct
*
;
// Forward declaration of cuBLAS types.
using
cublasHandle_t
=
struct
cublasContext
*
;
// Forward declaration of cuSOLVER types.
using
cusolverDnHandle_t
=
struct
cusolverDnContext
*
;
// Forward declaration of cuSparse types.
using
cusparseHandle_t
=
struct
cusparseContext
*
;
// Forward declaration of cuFFT types.
using
cufftHandle
=
int
;
// Forward declaration of NCCL types.
using
ncclComm_t
=
struct
ncclComm
*
;
/// Forward declaration of ROCM types.
#include <cstddef>
using
hipDevice_t
=
int
;
using
hipCtx_t
=
struct
ihipCtx_t
*
;
using
hipModule_t
=
struct
ihipModule_t
*
;
using
hipStream_t
=
struct
ihipStream_t
*
;
using
hipEvent_t
=
struct
ihipEvent_t
*
;
using
hipFunction_t
=
struct
ihipModuleSymbol_t
*
;
// Forward declaration of MIOpen types.
using
miopenHandle_t
=
struct
miopenHandle
*
;
using
miopenAcceleratorQueue_t
=
hipStream_t
;
using
miopenFusionOpDescriptor_t
=
struct
miopenFusionOpDescriptor
*
;
using
miopenTensorDescriptor_t
=
struct
miopenTensorDescriptor
*
;
using
miopenConvolutionDescriptor_t
=
struct
miopenConvolutionDescriptor
*
;
using
miopenPoolingDescriptor_t
=
struct
miopenPoolingDescriptor
*
;
using
miopenLRNDescriptor_t
=
struct
miopenLRNDescriptor
*
;
using
miopenActivationDescriptor_t
=
struct
miopenActivationDescriptor
*
;
using
miopenRNNDescriptor_t
=
struct
miopenRNNDescriptor
*
;
using
miopenCTCLossDescriptor_t
=
struct
miopenCTCLossDescriptor
*
;
using
miopenDropoutDescriptor_t
=
struct
miopenDropoutDescriptor
*
;
using
miopenFusionPlanDescriptor_t
=
struct
miopenFusionPlanDescriptor
*
;
using
miopenOperatorDescriptor_t
=
struct
miopenOperatorDescriptor
*
;
using
miopenOperatorArgs_t
=
struct
miopenOperatorArgs
*
;
using
miopenAllocatorFunction
=
void
*
(
*
)(
void
*
context
,
size_t
sizeBytes
);
// using miopenDeallocatorFunction = void *(*)(void *context, void *memory);
// struct miopenConvAlgoPerf_t;
// struct miopenConvSolution_t;
// Forward declaration of rocBLAS types.
using
rocblas_handle
=
struct
_rocblas_handle
*
;
// Forward declaration of hipfft types.
using
hipfftHandle
=
struct
hipfftHandle_t
*
;
// Forward declaration of rocSOLVER types.
using
rocsolver_handle
=
rocblas_handle
;
// Forward declaration of rocSparse types.
using
rocsparse_handle
=
struct
_rocsparse_handle
*
;
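The point of forwards.h is that headers such as gpu_decls.h and gpu_context.h can mention CUDA, cuDNN, cuBLAS and ROCm handle types without pulling in the vendor headers. A hedged sketch of the effect (the function is illustrative):

// Hypothetical header that only needs the forward declarations, not cudnn.h:
#include "paddle/pten/backends/gpu/forwards.h"

// Declares an API in terms of the opaque handle type; the definition, in a
// .cc/.cu file that does include the real cuDNN header, can use it normally.
void ConfigureDnn(cudnnHandle_t handle);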
paddle/pten/backends/gpu/gpu_context.cc
0 → 100644
View file @ a821c4a9
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_context.h"
#include <array>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_info.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#include "paddle/pten/backends/dynload/cusolver.h"
#include "paddle/pten/backends/dynload/cusparse.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#include "paddle/pten/backends/dynload/nccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/pten/backends/dynload/rccl.h"
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#endif // PADDLE_WITH_HIP
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "unsupported/Eigen/CXX11/Tensor"
// TODO(pten): remove fluid header.
#include "paddle/fluid/platform/enforce.h"
namespace
pten
{
namespace
internal
{
class
EigenGpuStreamDevice
:
public
Eigen
::
StreamInterface
{
public:
EigenGpuStreamDevice
()
:
scratch_
(
nullptr
),
semaphore_
(
nullptr
)
{
Eigen
::
initializeDeviceProp
();
}
~
EigenGpuStreamDevice
()
override
{}
void
Reinitialize
(
gpuStream_t
cuda_stream
,
Allocator
*
allocator
,
GPUPlace
place
)
{
stream_
=
cuda_stream
;
place_
=
place
;
allocator_
=
allocator
;
device_prop_
=
&
Eigen
::
m_deviceProperties
[
place
.
device
];
}
const
gpuStream_t
&
stream
()
const
override
{
return
stream_
;
}
const
gpuDeviceProp
&
deviceProperties
()
const
override
{
return
*
device_prop_
;
}
void
*
allocate
(
size_t
num_bytes
)
const
override
{
if
(
UNLIKELY
(
num_bytes
==
0
))
{
return
nullptr
;
}
auto
buf
=
allocator_
->
Allocate
(
num_bytes
);
VLOG
(
4
)
<<
"Eigen allocated at "
<<
buf
->
ptr
()
<<
" requested "
<<
num_bytes
;
void
*
retv
=
buf
->
ptr
();
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mtx_
);
allocations_
.
emplace
(
retv
,
std
::
move
(
buf
));
}
return
retv
;
}
void
deallocate
(
void
*
buffer
)
const
override
{
if
(
LIKELY
(
buffer
))
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mtx_
);
allocations_
.
erase
(
buffer
);
}
}
void
*
scratchpad
()
const
override
{
if
(
scratch_
==
NULL
)
{
scratch_
=
allocate
(
Eigen
::
kGpuScratchSize
+
sizeof
(
unsigned
int
));
}
return
scratch_
;
}
unsigned
int
*
semaphore
()
const
override
{
if
(
semaphore_
==
NULL
)
{
char
*
scratch
=
static_cast
<
char
*>
(
scratchpad
())
+
Eigen
::
kGpuScratchSize
;
semaphore_
=
reinterpret_cast
<
unsigned
int
*>
(
scratch
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipMemsetAsync
(
semaphore_
,
0
,
sizeof
(
unsigned
int
),
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaMemsetAsync
(
semaphore_
,
0
,
sizeof
(
unsigned
int
),
stream_
));
#endif
}
return
semaphore_
;
}
private:
GPUPlace
place_
;
gpuStream_t
stream_
;
// not owned;
Allocator
*
allocator_
;
// not owned;
const
gpuDeviceProp
*
device_prop_
;
// not owned;
mutable
void
*
scratch_
;
mutable
unsigned
int
*
semaphore_
;
mutable
std
::
mutex
mtx_
;
// to protect allocations_
mutable
std
::
unordered_map
<
void
*
,
Allocator
::
AllocationPtr
>
allocations_
;
};
#ifdef PADDLE_WITH_HIP
static
void
StreamCallbackFunc
(
gpuStream_t
stream
,
gpuError_t
status
,
void
*
user_data
)
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
static
void
CUDART_CB
StreamCallbackFunc
(
void
*
user_data
)
#else
static
void
CUDART_CB
StreamCallbackFunc
(
cudaStream_t
stream
,
cudaError_t
status
,
void
*
user_data
)
#endif
#endif
{
std
::
unique_ptr
<
std
::
function
<
void
()
>>
func
(
reinterpret_cast
<
std
::
function
<
void
()
>*>
(
user_data
));
(
*
func
)();
}
}
// namespace internal
class
DnnWorkspaceHandle
{
public:
explicit
inline
DnnWorkspaceHandle
(
Allocator
*
allocator
)
:
allocator_
(
allocator
)
{}
inline
void
RunFunc
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
required_workspace_bytes
)
{
if
(
required_workspace_bytes
>
WorkspaceSize
())
{
ReallocWorkspace
(
required_workspace_bytes
);
}
VLOG
(
2
)
<<
"Cudnn workspace size at RunFunc: "
<<
static_cast
<
double
>
(
WorkspaceSize
())
/
(
1
<<
20
)
<<
" MB"
;
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
cudnn_func
(
allocation_
?
allocation_
->
ptr
()
:
nullptr
);
}
}
/*! \brief Thread which call RunFuncSync() would release gpu memory after
* running the function. Currently this function is only used when cudnn
* exhaustive searching and callers have to guarantee that the input function
* is host blocking */
inline
void
RunFuncSync
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
required_workspace_bytes
)
{
RunFunc
(
cudnn_func
,
required_workspace_bytes
);
ResetWorkspace
();
}
inline
size_t
WorkspaceSize
()
{
if
(
allocation_
==
nullptr
)
{
return
0
;
}
return
allocation_
->
size
();
}
void
ResetWorkspace
()
{
allocation_
=
nullptr
;
}
void
ReallocWorkspace
(
size_t
required_workspace_bytes
)
{
if
(
required_workspace_bytes
<=
WorkspaceSize
())
return
;
// reset allocation first before re-allocate to save memory
allocation_
.
reset
();
allocation_
=
allocator_
->
Allocate
(
required_workspace_bytes
);
}
private:
Allocator
::
AllocationPtr
allocation_
{
nullptr
};
Allocator
*
allocator_
{
nullptr
};
std
::
mutex
mtx_
;
};
struct
GPUContext
::
Impl
{
void
Init
()
{
owned_
=
true
;
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
InitGpuProperties
();
InitStream
();
InitEigenDevice
();
InitBlasHandle
();
InitDNNHandle
();
InitSolverHandle
();
InitSparseHandle
();
InitDnnWorkspace
();
}
void
PartialInitWithoutAllocator
()
{
owned_
=
true
;
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
InitGpuProperties
();
InitStream
();
InitBlasHandle
();
InitDNNHandle
();
InitSolverHandle
();
InitSparseHandle
();
}
void
PartialInitWithAllocator
()
{
owned_
=
true
;
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
InitEigenDevice
();
InitDnnWorkspace
();
}
Impl
()
:
place_
(
GPUPlace
())
{}
explicit
Impl
(
const
GPUPlace
&
place
)
:
place_
(
place
)
{}
~
Impl
()
{
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
device
);
DestoryInternalWorkspace
();
DestoryInternalEigenDevice
();
DestroyInternalSparseHandle
();
DestroyInternalSolverHandle
();
DestroyInternalDnnHandle
();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
nccl_comm_
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
ncclCommDestroy
(
nccl_comm_
));
}
#endif
DestroyInternalBlasHandle
();
DestoryInternalStream
();
}
const
Place
&
GetPlace
()
const
{
return
place_
;
}
bool
IsTensorCoreAvailable
()
const
{
return
blas_tensor_core_handle_
!=
nullptr
;
}
void
InitGpuProperties
()
{
backends
::
gpu
::
GPUDeviceGuard
guard
(
place_
.
GetDeviceId
());
compute_capability_
=
backends
::
gpu
::
GetGPUComputeCapability
(
place_
.
GetDeviceId
());
multi_process_
=
backends
::
gpu
::
GetGPUMultiProcessors
(
place_
.
GetDeviceId
());
max_threads_per_mp_
=
backends
::
gpu
::
GetGPUMaxThreadsPerMultiProcessor
(
place_
.
GetDeviceId
());
max_grid_dim_size_
=
backends
::
gpu
::
GetGpuMaxGridDimSize
(
place_
.
GetDeviceId
());
max_threads_per_block_
=
backends
::
gpu
::
GetGPUMaxThreadsPerBlock
(
place_
.
GetDeviceId
());
driver_version_
=
backends
::
gpu
::
GetGPUDriverVersion
(
place_
.
GetDeviceId
());
runtime_version_
=
backends
::
gpu
::
GetGPURuntimeVersion
(
place_
.
GetDeviceId
());
// TODO(wilber): glog may be replaced in the future?
LOG_FIRST_N
(
WARNING
,
1
)
<<
"Please NOTE: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
", GPU Compute Capability: "
<<
compute_capability_
/
10
<<
"."
<<
compute_capability_
%
10
<<
", Driver API Version: "
<<
driver_version_
/
1000
<<
"."
<<
(
driver_version_
%
100
)
/
10
<<
", Runtime API Version: "
<<
runtime_version_
/
1000
<<
"."
<<
(
runtime_version_
%
100
)
/
10
;
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
cudnn_dso_ver
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
cudnn_dso_ver
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
cudnn_dso_ver
/
100
<<
"."
<<
cudnn_dso_ver
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
#else
size_t
cudnn_dso_ver
=
dynload
::
cudnnGetVersion
();
LOG_FIRST_N
(
WARNING
,
1
)
<<
"device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
", cuDNN Version: "
<<
cudnn_dso_ver
/
1000
<<
"."
<<
(
cudnn_dso_ver
%
1000
)
/
100
<<
"."
;
// Check CUDA/CUDNN version compatiblity
auto
local_cuda_version
=
(
driver_version_
/
1000
)
*
10
+
(
driver_version_
%
100
)
/
10
;
auto
compile_cuda_version
=
(
CUDA_VERSION
/
1000
)
*
10
+
(
CUDA_VERSION
%
100
)
/
10
;
if
(
local_cuda_version
<
compile_cuda_version
)
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
static_cast
<
int
>
(
place_
.
device
)
<<
". The installed Paddle is compiled with CUDA "
<<
compile_cuda_version
/
10
<<
"."
<<
compile_cuda_version
%
10
<<
", but CUDA runtime version in your machine is "
<<
local_cuda_version
/
10
<<
"."
<<
local_cuda_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDA "
"version."
;
}
#endif
}
void
InitDnnWorkspace
()
{
PD_CHECK
(
allocator_
!=
nullptr
,
"the device allocator for gpu context is nullptr."
);
workspace_
=
new
DnnWorkspaceHandle
(
allocator_
);
}
void
DestoryInternalWorkspace
()
{
if
(
owned_
&&
workspace_
!=
nullptr
)
{
delete
workspace_
;
stream_
=
nullptr
;
}
}
DnnWorkspaceHandle
*
GetDnnWorkspace
()
{
PD_CHECK
(
workspace_
!=
nullptr
,
"the gpu cudnn workspace is nullptr."
);
return
workspace_
;
}
void
InitStream
()
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamCreateWithPriority
(
&
stream_
,
hipStreamDefault
,
0
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamCreateWithPriority
(
&
stream_
,
cudaStreamDefault
,
0
));
#endif
}
void
DestoryInternalStream
()
{
if
(
owned_
&&
stream_
!=
nullptr
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamDestroy
(
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamDestroy
(
stream_
));
#endif
}
stream_
=
nullptr
;
}
void
SetStream
(
gpuStream_t
stream
)
{
stream_
=
stream
;
}
gpuStream_t
GetStream
()
const
{
PD_CHECK
(
stream_
!=
nullptr
,
"the gpu stream is nullptr."
);
return
stream_
;
}
void
InitEigenDevice
()
{
PD_CHECK
(
allocator_
!=
nullptr
,
"the allocator for eigen device is nullptr."
);
eigen_stream_
.
reset
(
new
internal
::
EigenGpuStreamDevice
());
eigen_stream_
->
Reinitialize
(
stream_
,
allocator_
,
place_
);
eigen_device_
=
new
Eigen
::
GpuDevice
(
eigen_stream_
.
get
());
}
void
DestoryInternalEigenDevice
()
{
if
(
owned_
&&
eigen_device_
!=
nullptr
)
{
delete
eigen_device_
;
eigen_device_
=
nullptr
;
}
}
void
SetEigenDevice
(
Eigen
::
GpuDevice
*
device
)
{
eigen_device_
=
device
;
}
Eigen
::
GpuDevice
*
eigen_device
()
const
{
PD_CHECK
(
eigen_device_
!=
nullptr
,
"the gpu eigen_device is nullptr."
);
return
eigen_device_
;
}
void
InitBlasHandle
()
{
#ifdef PADDLE_WITH_HIP
pten
::
dynload
::
rocblas_create_handle
(
&
blas_handle_
);
pten
::
dynload
::
rocblas_set_stream
(
blas_handle_
,
stream_
);
#else // PADDLE_WITH_CUDA
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasCreate
(
&
blas_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasSetStream
(
blas_handle_
,
stream_
));
#if CUDA_VERSION >= 9000
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasCreate
(
&
blas_tensor_core_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasSetStream
(
blas_tensor_core_handle_
,
stream_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasSetMathMode
(
blas_tensor_core_handle_
,
CUBLAS_TENSOR_OP_MATH
));
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasCreate
(
&
blas_tf32_tensor_core_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasSetStream
(
blas_tf32_tensor_core_handle_
,
stream_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cublasSetMathMode
(
blas_tf32_tensor_core_handle_
,
CUBLAS_TF32_TENSOR_OP_MATH
));
#endif // CUDA_VERSION >= 11000
#endif // CUDA_VERSION >= 9000
#endif // PADDLE_WITH_HIP
}
void
DestroyInternalBlasHandle
()
{
#ifdef PADDLE_WITH_HIP
if
(
owned_
&&
blas_handle_
!=
nullptr
)
{
pten
::
dynload
::
rocblas_destroy_handle
(
blas_handle_
);
blas_handle_
=
nullptr
;
}
#else
if
(
owned_
&&
blas_handle_
!=
nullptr
)
{
pten
::
dynload
::
cublasDestroy
(
blas_handle_
);
blas_handle_
=
nullptr
;
}
if
(
owned_
&&
blas_tensor_core_handle_
!=
nullptr
)
{
pten
::
dynload
::
cublasDestroy
(
blas_tensor_core_handle_
);
blas_tensor_core_handle_
=
nullptr
;
}
if
(
owned_
&&
blas_tf32_tensor_core_handle_
!=
nullptr
)
{
pten
::
dynload
::
cublasDestroy
(
blas_tf32_tensor_core_handle_
);
blas_tf32_tensor_core_handle_
=
nullptr
;
}
#endif // PADDLE_WITH_HIP
}
blasHandle_t
GetBlasHandle
()
const
{
PD_CHECK
(
blas_handle_
!=
nullptr
,
"the gpu blas handle is nullptr."
);
return
blas_handle_
;
}
void
SetBlasHandle
(
blasHandle_t
blas
)
{
blas_handle_
=
blas
;
}
void
InitDNNHandle
()
{
if
(
pten
::
dynload
::
HasCUDNN
())
{
#ifdef PADDLE_WITH_HIP
size_t
miopen_major
,
miopen_minor
,
miopen_patch
;
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenGetVersion
(
&
miopen_major
,
&
miopen_minor
,
&
miopen_patch
));
auto
local_miopen_version
=
(
miopen_major
*
1000
+
miopen_minor
*
10
+
miopen_patch
)
/
10
;
auto
compile_miopen_version
=
MIOPEN_VERSION
/
10
;
if
(
local_miopen_version
<
static_cast
<
size_t
>
(
compile_miopen_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place_
.
device
<<
". The installed Paddle is compiled with MIOPEN "
<<
compile_miopen_version
/
100
<<
"."
<<
compile_miopen_version
%
100
<<
", but MIOPEN version in your machine is "
<<
local_miopen_version
/
100
<<
"."
<<
local_miopen_version
%
100
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible MIOPEN "
"version."
;
}
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenCreate
(
&
dnn_handle_
));
PADDLE_ENFORCE_GPU_SUCCESS
(
dynload
::
miopenSetStream
(
dnn_handle_
,
stream_
));
#else
auto
local_cudnn_version
=
pten
::
dynload
::
cudnnGetVersion
()
/
100
;
auto
compile_cudnn_version
=
CUDNN_VERSION
/
100
;
if
(
local_cudnn_version
<
static_cast
<
size_t
>
(
compile_cudnn_version
))
{
LOG_FIRST_N
(
WARNING
,
1
)
<<
"WARNING: device: "
<<
place_
.
device
<<
". The installed Paddle is compiled with CUDNN "
<<
compile_cudnn_version
/
10
<<
"."
<<
compile_cudnn_version
%
10
<<
", but CUDNN version in your machine is "
<<
local_cudnn_version
/
10
<<
"."
<<
local_cudnn_version
%
10
<<
", which may cause serious incompatible bug. "
<<
"Please recompile or reinstall Paddle with compatible CUDNN "
"version."
;
}
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cudnnCreate
(
&
dnn_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cudnnSetStream
(
dnn_handle_
,
stream_
));
#endif
}
else
{
dnn_handle_
=
nullptr
;
}
}
dnnHandle_t
GetDnnHandle
()
{
PD_CHECK
(
dnn_handle_
!=
nullptr
,
"the gpu dnn handle is nullptr."
);
return
dnn_handle_
;
}
void
DestroyInternalDnnHandle
()
{
#ifdef PADDLE_WITH_HIP
if
(
owned_
&&
dnn_handle_
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
pten
::
dynload
::
miopenDestroy
(
dnn_handle_
));
dnn_handle_
=
nullptr
;
}
#else
if
(
owned_
&&
dnn_handle_
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
pten
::
dynload
::
cudnnDestroy
(
dnn_handle_
));
dnn_handle_
=
nullptr
;
}
#endif // PADDLE_WITH_HIP
}
void
SetDnnHandle
(
dnnHandle_t
handle
)
{
dnn_handle_
=
handle
;
}
void
InitSolverHandle
()
{
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cusolverDnCreate
(
&
solver_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
pten
::
dynload
::
cusolverDnSetStream
(
solver_handle_
,
stream_
));
#endif
}
void
DestroyInternalSolverHandle
()
{
#ifndef PADDLE_WITH_HIP
if
(
owned_
&&
solver_handle_
!=
nullptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
pten
::
dynload
::
cusolverDnDestroy
(
solver_handle_
));
solver_handle_
=
nullptr
;
}
#endif
}
solverHandle_t
GetSolverHandle
()
const
{
PD_CHECK
(
solver_handle_
!=
nullptr
,
"the gpu solver handle is nullptr."
);
return
solver_handle_
;
}
void
SetSolverHandle
(
solverHandle_t
handle
)
{
solver_handle_
=
handle
;
}
void
InitSparseHandle
()
{
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseCreate
(
&
sparse_handle_
));
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseSetStream
(
sparse_handle_
,
stream_
));
#endif
#endif
}
void
DestroyInternalSparseHandle
()
{
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
if
(
owned_
&&
sparse_handle_
!=
nullptr
)
{
PADDLE_RETRY_CUDA_SUCCESS
(
dynload
::
cusparseDestroy
(
sparse_handle_
));
sparse_handle_
=
nullptr
;
}
#endif
#endif
}
sparseHandle_t
GetSparseHandle
()
const
{
PD_CHECK
(
sparse_handle_
!=
nullptr
,
"the gpu sparse handle is nullptr."
);
return
sparse_handle_
;
}
void
SetSparseHandle
(
sparseHandle_t
handle
)
{
sparse_handle_
=
handle
;
}
void
Wait
()
const
{
#ifdef PADDLE_WITH_HIP
hipError_t
e_sync
=
hipSuccess
;
#if !defined(_WIN32)
e_sync
=
hipStreamSynchronize
(
stream_
);
#else
while
(
e_sync
=
hipStreamQuery
(
stream_
))
{
if
(
e_sync
==
hipErrorNotReady
)
continue
;
break
;
}
#endif // !defined(_WIN32)
#else // PADDLE_WITH_HIP
cudaError_t
e_sync
=
cudaSuccess
;
#if !defined(_WIN32)
e_sync
=
cudaStreamSynchronize
(
stream_
);
#else
while
(
e_sync
=
cudaStreamQuery
(
stream_
))
{
if
(
e_sync
==
cudaErrorNotReady
)
continue
;
break
;
}
#endif // !defined(_WIN32)
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
e_sync
);
}
void
WaitEvent
(
gpuEvent_t
ev
)
const
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamWaitEvent
(
stream_
,
ev
,
0
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamWaitEvent
(
stream_
,
ev
,
0
));
#endif
}
ncclComm_t
GetNcclComm
()
const
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr.");
return
nccl_comm_
;
#endif
return
nullptr
;
}
void
SetNcclComm
(
ncclComm_t
comm
)
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_comm_
=
comm
;
#endif
}
inline
void
CublasCall
(
const
std
::
function
<
void
(
blasHandle_t
)
>&
callback
)
const
{
if
(
blas_tf32_tensor_core_handle_
!=
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
blas_tf32_mtx_
);
callback
(
blas_tf32_tensor_core_handle_
);
}
else
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
blas_mtx_
);
callback
(
blas_handle_
);
}
}
inline
void
TensorCoreCublasCallIfAvailable
(
const
std
::
function
<
void
(
blasHandle_t
)
>&
callback
)
const
{
if
(
blas_tensor_core_handle_
!=
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
blas_tensor_core_mtx_
);
callback
(
blas_tensor_core_handle_
);
}
else
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
blas_mtx_
);
callback
(
blas_handle_
);
}
}
inline
void
CusparseCall
(
const
std
::
function
<
void
(
sparseHandle_t
)
>&
callback
)
const
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
sparse_mtx_
);
callback
(
sparse_handle_
);
}
void
RecordEvent
(
gpuEvent_t
ev
,
const
std
::
function
<
void
()
>&
callback
)
const
{
callback
();
RecordEvent
(
ev
);
}
void
RecordEvent
(
gpuEvent_t
ev
)
const
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipEventRecord
(
ev
,
stream_
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaEventRecord
(
ev
,
stream_
));
#endif
}
void
AddStreamCallback
(
const
std
::
function
<
void
()
>&
callback
)
const
{
// TODO(wilber): Do we need ThreadPool?
auto
*
func
=
new
std
::
function
<
void
()
>
([
this
,
callback
]
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
stream_call_back_mtx_
);
last_future_
=
std
::
async
(
std
::
launch
::
deferred
,
[
&
]()
{
callback
();
});
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipStreamAddCallback
(
stream_
,
internal
::
StreamCallbackFunc
,
func
,
0
));
#endif
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10000
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaLaunchHostFunc
(
stream_
,
internal
::
StreamCallbackFunc
,
func
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaStreamAddCallback
(
stream_
,
internal
::
StreamCallbackFunc
,
func
,
0
));
#endif
#endif
}
void
WaitStreamCallback
()
const
{
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
pten
::
backends
::
gpu
::
GpuStreamSync
(
stream_
);
#endif
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
stream_call_back_mtx_
);
if
(
last_future_
.
valid
())
{
last_future_
.
wait
();
}
}
}
bool
owned_
{
false
};
Place
place_
;
int
compute_capability_
;
int
runtime_version_
;
int
driver_version_
;
int
multi_process_
;
int
max_threads_per_mp_
;
int
max_threads_per_block_
;
std
::
array
<
int
,
3
>
max_grid_dim_size_
;
gpuStream_t
stream_
{
nullptr
};
Eigen
::
GpuDevice
*
eigen_device_
{
nullptr
};
blasHandle_t
blas_handle_
{
nullptr
};
blasHandle_t
blas_tensor_core_handle_
{
nullptr
};
blasHandle_t
blas_tf32_tensor_core_handle_
{
nullptr
};
dnnHandle_t
dnn_handle_
{
nullptr
};
solverHandle_t
solver_handle_
{
nullptr
};
sparseHandle_t
sparse_handle_
{
nullptr
};
DnnWorkspaceHandle
*
workspace_
{
nullptr
};
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
// NOTE: Distributed communicator, distributed framework manages its
// resources.
ncclComm_t
nccl_comm_
{
nullptr
};
#endif
mutable
std
::
mutex
blas_mtx_
;
mutable
std
::
mutex
blas_tensor_core_mtx_
;
mutable
std
::
mutex
blas_tf32_mtx_
;
mutable
std
::
mutex
sparse_mtx_
;
mutable
std
::
mutex
stream_call_back_mtx_
;
mutable
std
::
future
<
void
>
last_future_
;
Allocator
*
allocator_
{
nullptr
};
// external resource.
// A internal resouce to initinalize eigen_device.
std
::
unique_ptr
<
internal
::
EigenGpuStreamDevice
>
eigen_stream_
{
nullptr
};
};
GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}

GPUContext::GPUContext(const GPUPlace& place)
    : DeviceContext(), impl_(std::make_unique<Impl>(place)) {}

GPUContext::~GPUContext() = default;

const Place& GPUContext::GetPlace() const { return impl_->GetPlace(); }

gpuStream_t GPUContext::stream() const { return impl_->GetStream(); }

dnnHandle_t GPUContext::cudnn_handle() const { return impl_->GetDnnHandle(); }

blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); }

solverHandle_t GPUContext::cusolver_dn_handle() const { return impl_->GetSolverHandle(); }

sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); }

void GPUContext::Wait() const { impl_->Wait(); }

void GPUContext::WaitEvent(gpuEvent_t ev) const { impl_->WaitEvent(ev); }

bool GPUContext::tensor_core_available() const { return impl_->IsTensorCoreAvailable(); }

int GPUContext::GetComputeCapability() const { return impl_->compute_capability_; }

int GPUContext::GetMaxPhysicalThreadCount() const {
  return impl_->multi_process_ * impl_->max_threads_per_mp_;
}

int GPUContext::GetSMCount() const { return impl_->multi_process_; }

int GPUContext::GetMaxThreadsPerBlock() const { return impl_->max_threads_per_block_; }

std::array<int, 3> GPUContext::GetCUDAMaxGridDimSize() const {
  return impl_->max_grid_dim_size_;
}

Eigen::GpuDevice* GPUContext::eigen_device() const { return impl_->eigen_device(); }

DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() { return impl_->GetDnnWorkspace(); }

void GPUContext::CublasCall(const std::function<void(blasHandle_t)>& callback) const {
  impl_->CublasCall(callback);
}

void GPUContext::TensorCoreCublasCallIfAvailable(
    const std::function<void(blasHandle_t)>& callback) const {
  impl_->TensorCoreCublasCallIfAvailable(callback);
}

void GPUContext::CusparseCall(const std::function<void(sparseHandle_t)>& callback) const {
  impl_->CusparseCall(callback);
}

void GPUContext::RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const {
  impl_->RecordEvent(ev, callback);
}

void GPUContext::RecordEvent(gpuEvent_t ev) const { impl_->RecordEvent(ev); }

void GPUContext::AddStreamCallback(const std::function<void()>& callback) const {
  impl_->AddStreamCallback(callback);
}

void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); }

ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); }

void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); }

void GPUContext::Init() {
  impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
  impl_->Init();
}

void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); }

void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { impl_->SetEigenDevice(device); }

void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); }

void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); }

void GPUContext::SetSolverHandle(solverHandle_t handle) { impl_->SetSolverHandle(handle); }

void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); }

void GPUContext::SetDnnWorkspaceHandle(DnnWorkspaceHandle* handle) { impl_->workspace_ = handle; }

void GPUContext::PartialInitWithoutAllocator() { impl_->PartialInitWithoutAllocator(); }

void GPUContext::PartialInitWithAllocator() {
  impl_->allocator_ = const_cast<Allocator*>(&this->GetAllocator());
  impl_->PartialInitWithAllocator();
}

void GPUContext::SetComputeCapability(int val) { impl_->compute_capability_ = val; }

void GPUContext::SetMaxThreadsPerMultiProcessor(int val) { impl_->max_threads_per_mp_ = val; }

void GPUContext::SetMultiProcessors(int val) { impl_->multi_process_ = val; }

void GPUContext::SetMaxThreadsPerBlock(int val) { impl_->max_threads_per_block_ = val; }

void GPUContext::SetMaxGridDimSize(const std::array<int, 3>& val) {
  impl_->max_grid_dim_size_ = val;
}

void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; }

void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; }

}  // namespace pten
paddle/pten/backends/gpu/gpu_context.h
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -14,13 +14,162 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/device_context.h"
#include <array>
#include <functional>
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#include "paddle/pten/backends/gpu/gpu_helper.h"
#include "paddle/pten/common/place.h"
#include "paddle/pten/core/device_context.h"
namespace pten {
using GPUContext = paddle::platform::CUDADeviceContext;
}  // namespace pten
#endif
class DnnWorkspaceHandle;

class GPUContext : public DeviceContext {
 public:
  GPUContext();

  explicit GPUContext(const GPUPlace& place);

  virtual ~GPUContext();

  /*! \brief  Return place in the device context. */
  const Place& GetPlace() const override;

  /*! \brief  Return gpu stream in the device context. */
  gpuStream_t stream() const;

  /*! \brief  Return cudnn handle in the device context. */
  dnnHandle_t cudnn_handle() const;

  /*! \brief  Return cublas handle in the device context. */
  blasHandle_t cublas_handle() const;

  /*! \brief  Return cusolver handle in the device context. */
  solverHandle_t cusolver_dn_handle() const;

  /*! \brief  Return cusparse handle in the device context. */
  sparseHandle_t cusparse_handle() const;

  /*! \brief  Wait for all operations completion in the stream. */
  void Wait() const override;

  /*! \brief  Wait for event in the stream. */
  void WaitEvent(gpuEvent_t ev) const;

  /*! \brief  Check whether tensor core is supported */
  bool tensor_core_available() const;

  /*! \brief  Return compute capability in the device context. */
  int GetComputeCapability() const;

  /*! \brief  Return the max physical thread count in the device context */
  int GetMaxPhysicalThreadCount() const;

  /*! \brief  Return the SM count in the device context */
  int GetSMCount() const;

  /*! \brief  Return the Max thread num of block in the device context */
  int GetMaxThreadsPerBlock() const;

  /*! \brief  Return the max grid dim size in the device context */
  std::array<int, 3> GetCUDAMaxGridDimSize() const;

  /*! \brief  Return eigen device in the device context. */
  Eigen::GpuDevice* eigen_device() const;

  /*! \brief  Return a cudnn workspace handle to call multiple cudnn
   *  functions without interrupting by other threads.
   *  Once the first cudnn function is called by the handle, a lock
   *  would be acquired to prevent other threads from accessing the
   *  workspace. Once the handle is destructed, the lock would be released.
   */
  DnnWorkspaceHandle* cudnn_workspace_handle();

 public:
  /*! \brief  Call cublas function safely. */
  void CublasCall(const std::function<void(blasHandle_t)>&) const;

  /*! \brief  Call cublas function with Tensor Core safely. If
      Tensor Core is not available, use DEFAULT_MATH instead. */
  void TensorCoreCublasCallIfAvailable(const std::function<void(blasHandle_t)>&) const;

  /*! \brief  Call cusparse function safely. */
  void CusparseCall(const std::function<void(sparseHandle_t)>&) const;

  void RecordEvent(gpuEvent_t ev, const std::function<void()>& callback) const;

  void RecordEvent(gpuEvent_t ev) const;

  void AddStreamCallback(const std::function<void()>& callback) const;

  void WaitStreamCallback() const;

 public:
  /*! \brief  Return nccl communicators. */
  ncclComm_t nccl_comm() const;

  /*! \brief  Set nccl communicators. */
  void set_nccl_comm(ncclComm_t comm);

 public:
  // NOTE: DeviceContext hold resources. Used in training scenarios.
  // The interface used by the training scene, DeviceContext will initialize
  // all resources and delete them when destructing.
  // Note that you must set the Allocator before calling Init function.
  void Init();

  // TODO(wilber): Why does the GetAllocator interface require a stream
  // parameter? The temporary trick method bypasses this problem, and the
  // following interfaces need to be deleted later.

  // Note that this is a trick implementation, which can be used to partially
  // initialize when the SetAllocator interface is not called.
  void PartialInitWithoutAllocator();
  // Note that this is a trick implementation that can be used to initialize
  // resources that require an Allocator when the SetAllocator interface is
  // called.
  void PartialInitWithAllocator();

 protected:
  // NOTE: External users manage resources. Used in inference scenarios.
  // The Set interface is for inference only, DeviceContext will mark the
  // resource as external, and will not delete any resource when destructing.
  void SetStream(gpuStream_t);

  void SetEigenDevice(Eigen::GpuDevice*);

  void SetBlasHandle(blasHandle_t);

  void SetDnnHandle(dnnHandle_t);

  void SetSolverHandle(solverHandle_t);

  void SetSparseHandle(sparseHandle_t);

  void SetDnnWorkspaceHandle(DnnWorkspaceHandle*);

  void SetComputeCapability(int val);

  void SetMaxThreadsPerMultiProcessor(int val);

  void SetMultiProcessors(int val);

  void SetMaxThreadsPerBlock(int val);

  void SetMaxGridDimSize(const std::array<int, 3>& val);

  void SetDriverVersion(int val);

  void SetRuntimeVersion(int val);

 private:
  struct Impl;
  std::unique_ptr<Impl> impl_;
};

}  // namespace pten
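The header notes above spell out the training-scenario order: the allocator must be set before Init(). The following is a minimal sketch of that flow, not code from this commit; the caller-supplied pten::Allocator pointer and the GPUPlace(0) device id are assumptions for illustration.

// Sketch only: training-scenario initialization of a pten::GPUContext.
#include "paddle/pten/backends/gpu/gpu_context.h"

void BuildGpuContextSketch(const pten::Allocator* alloc /* assumed to come from the caller */) {
  pten::GPUContext ctx(pten::GPUPlace(0));
  ctx.SetAllocator(alloc);             // required before Init(), per the note above
  ctx.Init();                          // builds stream, cublas/cudnn/cusolver/cusparse handles
  gpuStream_t stream = ctx.stream();   // raw stream, ready for kernel launches
  (void)stream;
}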
paddle/pten/backends/gpu/gpu_decls.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
namespace pten {
#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = ROCM_TYPE;
#else  // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
using GPU_TYPE = CUDA_TYPE;
#endif
DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t);
DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t);

DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, miopenActivationDescriptor);
DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, cudnnTensorStruct, miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, cudnnFilterStruct, miopenTensorDescriptor);
DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, cudnnFilterDescriptor_t, miopenTensorDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, cudnnConvolutionStruct, miopenConvolutionDescriptor);
DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, cudnnConvolutionDescriptor_t, miopenConvolutionDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, cudnnPoolingDescriptor_t, miopenPoolingDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t);
DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);

DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle);
DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle);

#undef DECLARE_TYPE_FOR_GPU

using CUDAGraphID = unsigned long long;  // NOLINT

}  // namespace pten
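Because the aliases above are plain `using` declarations, code written against the backend-neutral `gpu*`/`dnn*` names compiles unchanged on CUDA and ROCm. A small illustration, assuming a CUDA build (the function name is hypothetical):

// Illustration only: on a CUDA build pten::gpuStream_t is literally cudaStream_t,
// so it can be handed to the CUDA runtime without any cast.
#include <cuda_runtime.h>
#include "paddle/pten/backends/gpu/gpu_decls.h"

void SyncOnPtenStream(pten::gpuStream_t stream) {
  cudaStreamSynchronize(stream);
}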
paddle/pten/backends/gpu/gpu_helper.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/gpu/rocm/rocm_helper.h"
#else
#include "paddle/pten/backends/gpu/cuda/cuda_helper.h"
#endif
#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int)
#endif
paddle/pten/backends/gpu/gpu_info.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/backends/gpu/gpu_info.h"
#include <vector>
#include "gflags/gflags.h"
DECLARE_string(selected_gpus);

namespace pten {
namespace backends {
namespace gpu {

static inline std::vector<std::string> Split(std::string const& original,
                                             char separator) {
  std::vector<std::string> results;
  std::string token;
  std::istringstream is(original);
  while (std::getline(is, token, separator)) {
    if (!token.empty()) {
      results.push_back(token);
    }
  }
  return results;
}

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
  // use user specified GPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (!FLAGS_selected_gpus.empty()) {
    auto devices_str = Split(FLAGS_selected_gpus, ',');
    for (auto id : devices_str) {
      devices.push_back(atoi(id.c_str()));
    }
  } else {
    int count = GetGPUDeviceCount();
    for (int i = 0; i < count; ++i) {
      devices.push_back(i);
    }
  }
  return devices;
}

}  // namespace gpu
}  // namespace backends
}  // namespace pten
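For illustration, with `--selected_gpus=0,2` the helper above yields `{0, 2}`, and an empty flag enumerates every visible device. A hedged usage sketch (the helper function name is hypothetical):

// Sketch: pick the first device the user selected, defaulting to device 0.
#include "paddle/pten/backends/gpu/gpu_info.h"

int FirstSelectedDevice() {
  auto devices = pten::backends::gpu::GetSelectedDevices();
  return devices.empty() ? 0 : devices.front();
}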
paddle/pten/backends/gpu/gpu_info.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <stddef.h>
#include <array>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_types.h"
namespace pten {
namespace backends {
namespace gpu {

//! Get the version of dnn
int DnnVersion();

//! Get the total number of GPU devices in system.
int GetGPUDeviceCount();

//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetGPUComputeCapability(int id);

//! Get the runtime version of the ith GPU
int GetGPURuntimeVersion(int id);

//! Get the driver version of the ith GPU
int GetGPUDriverVersion(int id);

//! Whether the current device supports TensorCore
bool TensorCoreAvailable();

//! Get the MultiProcessors of the ith GPU.
int GetGPUMultiProcessors(int id);

//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetGPUMaxThreadsPerMultiProcessor(int id);

//! Get the MaxThreads of each block of the ith GPU.
int GetGPUMaxThreadsPerBlock(int id);

//! Get the current GPU device id in system.
int GetCurrentDeviceId();

//! Get the maximum GridDim size for GPU buddy allocator.
std::array<int, 3> GetGpuMaxGridDimSize(int);

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();

//! Get the properties of the ith GPU device.
const gpuDeviceProp& GetDeviceProperties(int id);

//! Set the GPU device id for next execution.
void SetDeviceId(int device_id);

//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void* dst,
                    const void* src,
                    size_t count,
                    gpuMemcpyKind kind,
                    gpuStream_t stream);

//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void* dst, const void* src, size_t count, gpuMemcpyKind kind);

//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void* dst,
                        int dst_device,
                        const void* src,
                        int src_device,
                        size_t count,
                        gpuStream_t stream);

//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(void* dst, int dst_device, const void* src, int src_device, size_t count);

//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void* dst, int value, size_t count, gpuStream_t stream);

//! Blocks until stream has completed all operations.
void GpuStreamSync(gpuStream_t stream);

void GpuDestroyStream(gpuStream_t stream);

//! Blocks until device has completed all operations.
void GpuDeviceSync();

gpuError_t GpuGetLastError();

class GPUDeviceGuard {
 public:
  explicit inline GPUDeviceGuard(int dev_id) {
    int prev_id = GetCurrentDeviceId();
    if (prev_id != dev_id) {
      prev_id_ = prev_id;
      SetDeviceId(dev_id);
    }
  }

  inline ~GPUDeviceGuard() {
    if (prev_id_ != -1) {
      SetDeviceId(prev_id_);
    }
  }

  GPUDeviceGuard(const GPUDeviceGuard& o) = delete;
  GPUDeviceGuard& operator=(const GPUDeviceGuard& o) = delete;

 private:
  int prev_id_{-1};
};

}  // namespace gpu
}  // namespace backends
}  // namespace pten

#endif
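A hedged usage sketch of the RAII guard and the memcpy helpers above; the device id, buffer sizes, and function name are illustrative, and a CUDA build is assumed for the cudaMemcpyHostToDevice constant.

// Sketch: switch to device 1 for the duration of the copy; the guard's
// destructor restores the previous device id on scope exit.
#include <cuda_runtime.h>
#include "paddle/pten/backends/gpu/gpu_info.h"

void CopyToDevice1(void* dst, const void* src, size_t bytes, pten::gpuStream_t stream) {
  pten::backends::gpu::GPUDeviceGuard guard(1);
  pten::backends::gpu::GpuMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
  pten::backends::gpu::GpuStreamSync(stream);  // block until the copy finishes
}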
paddle/pten/backends/gpu/gpu_launch_config.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter config
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better as thread_per_block
num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace pten {
namespace backends {
namespace gpu {

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
   for round integer value into next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
#ifdef __HIPCC__
  return std::min(256, std::max(32, (n + 1)));
#else
  return std::min(1024, std::max(32, (n + 1)));
#endif
}

#ifdef WITH_NV_JETSON
// The number of threads cannot be assigned 1024 in some cases when the device
// is nano or tx2.
inline void ChangeThreadNum(const pten::CUDAContext& context,
                            int* num_thread,
                            int alternative_num_thread = 512) {
  if (context.GetComputeCapability() == 53 ||
      context.GetComputeCapability() == 62) {
    *num_thread = alternative_num_thread;
  }
}
#endif

struct GpuLaunchConfig {
 public:
  GpuLaunchConfig() {}

  size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }

  size_t GetGridSize() const {
    return block_per_grid.x * block_per_grid.y * block_per_grid.z;
  }

  size_t GetBlockSize() const {
    return thread_per_block.x * thread_per_block.y * thread_per_block.z;
  }

  int compute_capability = 0;
  dim3 thread_per_block = dim3(1, 1, 1);
  dim3 block_per_grid = dim3(1, 1, 1);
};

/* According to NVIDIA, if number of threads per block is 64/128/256/512,
 * cuda performs better. And number of blocks should be greater (at least
 * 2x~4x) than number of SMs. Hence, SM count is taken into account within
 * this function to determine the right number of threads per block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(const pten::GPUContext& context,
                                            int64_t numel,
                                            int vec_size = 1) {
  PADDLE_ENFORCE_GT(numel,
                    0,
                    paddle::platform::errors::InvalidArgument(
                        "element quantity should be greater than 0,"
                        " but received value is: %d.",
                        numel));
  // Get compute_capability
  const int capability = context.GetComputeCapability();
  /* If thread number per block is 64/128/256/512, cuda performs better.*/
  int limit_threads =
      std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
  if (capability == 53 || capability == 62) {
    limit_threads = 512;
  }
#endif
  int threads = limit_threads;
  int sm_count = context.GetSMCount();
  int active_threads_num = numel / vec_size;
  if (active_threads_num / (sm_count << 1) < limit_threads) {
    // Round up threads number into an exponential multiple of 2, while number
    // of active blocks is about twice of SM, to acquire better performance.
    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
  } else if (active_threads_num / (sm_count << 2) < limit_threads) {
    // Round up threads number into an exponential multiple of 2, while number
    // of active blocks is about 4 times of SM, to acquire better performance.
    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
  }
  // Number of threads per block shall be larger than 64.
  threads = std::max(64, threads);
  int blocks = DivUp(DivUp(numel, vec_size), threads);

  GpuLaunchConfig config;
  config.thread_per_block.x = threads;
  config.block_per_grid.x = blocks;
  config.compute_capability = capability;
  return config;
}

inline GpuLaunchConfig GetGpuLaunchConfig2D(const pten::GPUContext& context,
                                            int x_dim,
                                            int y_dim) {
  PADDLE_ENFORCE_GT(x_dim,
                    0,
                    paddle::platform::errors::InvalidArgument(
                        "x dim number should greater than 0,"
                        " but received value is: %d",
                        x_dim));
  PADDLE_ENFORCE_GT(y_dim,
                    0,
                    paddle::platform::errors::InvalidArgument(
                        "y dim number should greater than 0,"
                        " but received value is: %d",
                        y_dim));

  const int kThreadsPerBlock = 256;
  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);

  int max_physical_threads = context.GetMaxPhysicalThreadCount();
  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);

  GpuLaunchConfig config;
  // Noticed, block size is not align to 32, if needed do it yourself.
  config.thread_per_block = dim3(block_cols, block_rows, 1);

  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
  int grid_y = (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));

  config.block_per_grid = dim3(grid_x, grid_y, 1);
  return config;
}

}  // namespace gpu
}  // namespace backends
}  // namespace pten

#endif
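A hedged launch sketch using the 1-D config helper above: the kernel, its scaling factor, and the wrapper function are illustrative only (they are not part of this commit), and a CUDA build with the CUDA_KERNEL_LOOP macro from gpu_helper.h is assumed.

// Sketch: size a grid with GetGpuLaunchConfig1D and launch an elementwise kernel.
#include "paddle/pten/backends/gpu/gpu_launch_config.h"

template <typename T>
__global__ void ScaleKernel(T* data, int64_t n, T factor) {
  CUDA_KERNEL_LOOP(i, n) { data[i] *= factor; }
}

void ScaleOnGpu(const pten::GPUContext& ctx, float* data, int64_t numel) {
  auto config = pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel);
  ScaleKernel<float><<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
      data, numel, 2.0f);
}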
paddle/pten/backends/gpu/gpu_types.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/backends/gpu/forwards.h"
#include "paddle/pten/backends/gpu/gpu_decls.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
#include "paddle/pten/backends/dynload/miopen.h"
#include "paddle/pten/backends/dynload/rocblas.h"
#else // PADDLE_WITH_CUDA
#include "paddle/pten/backends/dynload/cublas.h"
#include "paddle/pten/backends/dynload/cudnn.h"
#endif
namespace pten {

#ifdef PADDLE_WITH_HIP
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = ROCM_TYPE;
#else  // PADDLE_WITH_CUDA
#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
  using GPU_TYPE = CUDA_TYPE;
#endif

DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t);
DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind);
DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t);

DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t);
DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t);
DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, cudnnTensorFormat_t, miopenTensorFormat_t);
DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t);

#undef DECLARE_TYPE_FOR_GPU

#ifdef PADDLE_WITH_HIP
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
  constexpr auto GPU_CV = ROCM_CV;
#else  // PADDLE_WITH_CUDA
#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \
  constexpr auto GPU_CV = CUDA_CV;
#endif

DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, hipErrorOutOfMemory);
DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady);
DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess);

#undef DECLARE_CONSTANT_FOR_GPU

}  // namespace pten
#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
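A small hedged sketch of how the backend-neutral error constants above combine with GpuGetLastError() from gpu_info.h; the helper function itself is illustrative, not part of this commit.

// Sketch: check the most recent GPU error through the backend-neutral aliases.
#include "paddle/pten/backends/gpu/gpu_info.h"
#include "paddle/pten/backends/gpu/gpu_types.h"

bool LastGpuCallSucceeded() {
  pten::gpuError_t err = pten::backends::gpu::GpuGetLastError();
  return err == pten::gpuSuccess;
}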
paddle/pten/backends/gpu/rocm/CMakeLists.txt
0 → 100644
hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda)
paddle/pten/backends/gpu/rocm/rocm_helper.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace pten {
namespace backends {
namespace gpu {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why need this macro? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
 * This for condition is risky. The value of `blockIdx.x * blockDim.x`
 * may be large, such as over 1GB; the first iteration is no problem here,
 * but when `i += blockDim.x * gridDim.x` is executed, the value of i
 * will exceed INT_MAX and overflow to a negative value. At
 * this time, the cycle condition `i < (n)` is still satisfied, so it
 * will cause illegal access to cuda memory.
 *
 * Here is a real example in ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112543864
* - INT_MAX = 2147483647
*
* So we polish the for condition as follow, the int64_t __index__ will
* prevent overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total element numbers
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
}  // namespace gpu
}  // namespace backends
}  // namespace pten
paddle/fluid/platform/device/gpu/rocm/rocm_info.cc → paddle/pten/backends/gpu/rocm/rocm_info.cc
...
...
@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include "paddle/pten/backends/gpu/gpu_info.h"
// TODO(pten): remove fluid headers.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;

namespace paddle {
namespace platform {
namespace pten {
namespace backends {
namespace gpu {

int DnnVersion() {
  if (!dynload::HasCUDNN()) return -1;
  size_t version_major, version_minor, version_patch;
...
...
@@ -78,11 +78,13 @@ int GetGPUDeviceCount() {
}

int GetGPUComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int major, minor;
  auto major_error_code = hipDeviceGetAttribute(
      &major, hipDeviceAttributeComputeCapabilityMajor, id);
...
...
@@ -95,22 +97,26 @@ int GetGPUComputeCapability(int id) {
}

int GetGPURuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int runtime_version = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version));
  return runtime_version;
}

int GetGPUDriverVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int driver_version = 0;
  PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version));
  return driver_version;
...
...
@@ -119,11 +125,13 @@ int GetGPUDriverVersion(int id) {
bool TensorCoreAvailable() { return false; }

int GetGPUMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int count;
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
      &count, hipDeviceAttributeMultiprocessorCount, id));
...
...
@@ -131,11 +139,13 @@ int GetGPUMultiProcessors(int id) {
}

int GetGPUMaxThreadsPerMultiProcessor(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int count;
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
      &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
...
...
@@ -144,11 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) {
}

int GetGPUMaxThreadsPerBlock(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  int count;
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute(
      &count, hipDeviceAttributeMaxThreadsPerBlock, id));
...
...
@@ -161,35 +173,37 @@ int GetCurrentDeviceId() {
  return device_id;
}

dim3 GetGpuMaxGridDimSize(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
std::array<int, 3> GetGpuMaxGridDimSize(int id) {
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
  dim3 ret;
                        id, GetGPUDeviceCount()));
  std::array<int, 3> ret;
  int size;
  auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
  PADDLE_ENFORCE_GPU_SUCCESS(error_code_x);
  ret.x = size;
  ret[0] = size;

  auto error_code_y = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
  PADDLE_ENFORCE_GPU_SUCCESS(error_code_y);
  ret.y = size;
  ret[1] = size;

  auto error_code_z = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
  PADDLE_ENFORCE_GPU_SUCCESS(error_code_z);
  ret.z = size;
  ret[2] = size;
  return ret;
}

const gpuDeviceProp& GetDeviceProperties(int id) {
  std::call_once(g_device_props_size_init_flag, [&] {
    int gpu_num = 0;
    gpu_num = platform::GetGPUDeviceCount();
    gpu_num = GetGPUDeviceCount();
    g_device_props_init_flags.resize(gpu_num);
    g_device_props.resize(gpu_num);
    for (int i = 0; i < gpu_num; ++i) {
...
...
@@ -198,16 +212,17 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
  });

  if (id == -1) {
    id = platform::GetCurrentDeviceId();
    id = GetCurrentDeviceId();
  }

  if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
    PADDLE_THROW(platform::errors::OutOfRange(
    PADDLE_THROW(paddle::platform::errors::OutOfRange(
        "The device id %d is out of range [0, %d), where %d is the number of "
        "devices on this machine. Because the device id should be greater than "
        "or equal to zero and smaller than the number of gpus. Please input "
        "appropriate device again!",
        id, static_cast<int>(g_device_props.size()),
        id, static_cast<int>(g_device_props.size()),
        static_cast<int>(g_device_props.size())));
  }
...
...
@@ -220,32 +235,43 @@ const gpuDeviceProp &GetDeviceProperties(int id) {
void SetDeviceId(int id) {
  // TODO(qijun): find a better way to cache the cuda device count
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    platform::errors::InvalidArgument(
  PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(),
                    paddle::platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetGPUDeviceCount()));
                        id, GetGPUDeviceCount()));
  PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
}

void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    gpuMemcpyKind kind, gpuStream_t stream) {
  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
}

void GpuMemcpySync(void *dst, const void *src, size_t count,
                   gpuMemcpyKind kind) {
  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind));
}

void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count, gpuStream_t stream) {
  PADDLE_ENFORCE_GPU_SUCCESS(
      hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
}

void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                       int src_device, size_t count) {
  PADDLE_ENFORCE_GPU_SUCCESS(
      hipMemcpyPeer(dst, dst_device, src, src_device, count));
}
...
...
@@ -265,5 +291,7 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); }

gpuError_t GpuGetLastError() { return hipGetLastError(); }

}  // namespace platform
}  // namespace paddle
}  // namespace gpu
}  // namespace backends
}  // namespace pten
paddle/pten/backends/xpu/xpu_context.cc
...
...
@@ -13,8 +13,11 @@
// limitations under the License.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include <memory>
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/common/place.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
...
...
@@ -24,12 +27,11 @@ namespace xpu = baidu::xpu::api;
namespace pten {

struct XPUContext::XPUImpl {
  void SetL3Cache() {
struct XPUContext::Impl {
  void SetL3Cache(int l3_size = 14155776) {
    const int MAX_XPU_NUM = 16;
    static void* l3ptrs[MAX_XPU_NUM] = {nullptr};

    int l3_size = 13.5 * 1024 * 1024;
    if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
      l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
    }
...
...
@@ -52,48 +54,28 @@ struct XPUContext::XPUImpl {
    }
  }

  XPUImpl() {
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
  }

  explicit XPUImpl(XPUPlace place) : place_(place) {
    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                            << static_cast<int>(place_.device);
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  Impl() : place_(XPUPlace()) {}

  // Users need to manage external resources.
  explicit XPUImpl(const XPUContextResource& ctx_res,
                   const XPUPlace& place = XPUPlace(0))
      : res_(ctx_res), place_(place) {
    context_ = res_.context;
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }
  explicit Impl(const Place& place) : place_(place) {}

  ~XPUImpl() {
    if (res_.context == nullptr && context_ != nullptr) {
  ~Impl() {
    if (owned_ && context_ != nullptr) {
      xpu::destroy_context(context_);
      context_ = nullptr;
    }
  }

  Place GetPlace() const { return place_; }

  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
  const Place& GetPlace() const { return place_; }

  xpu::Context* GetXContext() const {
    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
    return context_;
  }

  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
  xpu::BKCLContext_t GetBkclContext() const {
    PD_CHECK(bkcl_context_ != nullptr, "the xpu bkcl_context is nullptr.");
    return bkcl_context_;
  }

  void Wait() const {
    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
...
...
@@ -101,53 +83,41 @@ struct XPUContext::XPUImpl {
    xpu_wait(context_->xpu_stream);
  }

  void SetXContext(xpu::Context* context) {
    if (context == nullptr) {
      return;
    }
    res_.context = context;
    context_ = context;
  void Init() {
    owned_ = true;
    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                            << static_cast<int>(place_.device);
    context_ = xpu::create_context();
    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
    SetL3Cache();
  }

  void SetXContext(xpu::Context* context) { context_ = context; }

  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }

  XPUContextResource res_;
  XPUPlace place_;
  bool owned_{false};
  Place place_;
  backends::xpu::XPUVersion xpu_version_;
  xpu::Context* context_{nullptr};
  // NOTE: Distributed communicator, distributed framework manages its
  // resources, XPUContext only holds references.
  xpu::BKCLContext_t bkcl_context_{nullptr};
};
XPUContext::XPUContext() : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
}
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}

XPUContext::XPUContext(const XPUPlace& place) {
  impl_ = std::make_unique<XPUImpl>(place);
}

XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>();
  impl_->SetXContext(other.x_context());
  impl_->SetBkclContext(other.bkcl_context());
}

XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
  impl_ = std::move(other.impl_);
}
XPUContext::XPUContext(const XPUPlace& place)
    : DeviceContext(), impl_(std::make_unique<Impl>(place)) {}

XPUContext::~XPUContext() = default;

XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
  impl_ = std::make_unique<XPUImpl>(ctx_res);
}

Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); }

backends::xpu::XPUVersion XPUContext::xpu_version() const {
  return impl_->GetXpuVersion();
  return impl_->xpu_version_;
}

xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
...
...
@@ -158,12 +128,16 @@ xpu::BKCLContext_t XPUContext::bkcl_context() const {
void XPUContext::Wait() const { impl_->Wait(); }

void XPUContext::set_x_context(xpu::Context* context) {
void XPUContext::SetXContext(xpu::Context* context) {
  impl_->SetXContext(context);
}

void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
void XPUContext::SetL3Cache(int l3_size) { impl_->SetL3Cache(l3_size); }

void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
  impl_->SetBkclContext(context);
}

void XPUContext::Init() { impl_->Init(); }

}  // namespace pten
paddle/pten/backends/xpu/xpu_context.h
...
...
@@ -26,26 +26,15 @@ namespace xpu = baidu::xpu::api;
namespace pten {

struct XPUContextResource {
  xpu::Context* context{nullptr};
};

class XPUContext : public DeviceContext {
 public:
  // NOTE: DeviceContext hold resources. Used in training scenarios.
  XPUContext();

  explicit XPUContext(const XPUPlace&);

  // NOTE: Share the same underlying resources, please ensure that resources are
  // not released.
  XPUContext(const XPUContext&);

  XPUContext(XPUContext&&);

  virtual ~XPUContext();

  Place GetPlace() const override;
  const Place& GetPlace() const override;

  backends::xpu::XPUVersion xpu_version() const;
...
...
@@ -53,21 +42,28 @@ class XPUContext : public DeviceContext {
  // Return bkcl context.
  xpu::BKCLContext_t bkcl_context() const;
  void SetBkclContext(xpu::BKCLContext_t context);

  // Wait for all operations completion in the stream.
  void Wait() const override;

 public:
  // NOTE: External users manage resources. Used in inference scenarios.
  explicit XPUContext(const XPUContextResource&);
  // NOTE: DeviceContext hold resources. Used in training scenarios.
  // The interface used by the training scene, DeviceContext will initialize
  // all resources and delete them when destructing.
  void Init();

  void set_x_context(xpu::Context*);
 public:
  // NOTE: External users manage resources. Used in inference scenarios.
  // The Set interface is for inference only, DeviceContext will mark the
  // resource as external, and will not delete any resource when destructing.
  void SetXContext(xpu::Context*);

  void set_bkcl_context(xpu::BKCLContext_t context);
  void SetL3Cache(int l3_size = 14155776);

 private:
  struct XPUImpl;
  std::unique_ptr<XPUImpl> impl_;
  struct Impl;
  std::unique_ptr<Impl> impl_;
};

}  // namespace pten
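A minimal hedged sketch of the new training-scenario flow for XPUContext; the device id is illustrative and the helper function is not part of this commit.

// Sketch: build an XPUContext that owns its resources, then query the raw
// xpu::Context handle. Init() creates the context and sets the L3 cache.
#include "paddle/pten/backends/xpu/xpu_context.h"

void BuildXpuContextSketch() {
  pten::XPUContext ctx(pten::XPUPlace(0));
  ctx.Init();                        // creates and owns the xpu::Context
  xpu::Context* raw = ctx.x_context();
  (void)raw;
}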
paddle/pten/core/device_context.cc
...
...
@@ -23,7 +23,7 @@ struct DeviceContext::Impl {
  Impl() = default;
  ~Impl() = default;

  void SetDeviceAllocator(const Allocator* allocator) {
  void SetAllocator(const Allocator* allocator) {
    PADDLE_ENFORCE_NOT_NULL(
        allocator,
        pten::errors::InvalidArgument(
...
...
@@ -47,7 +47,7 @@ struct DeviceContext::Impl {
    zero_allocator_ = allocator;
  }

  const Allocator& GetDeviceAllocator() const {
  const Allocator& GetAllocator() const {
    PADDLE_ENFORCE_NOT_NULL(
        device_allocator_,
        pten::errors::InvalidArgument("Required device_allocator_ shall not be "
...
...
@@ -124,7 +124,7 @@ DeviceContext::DeviceContext() { impl_ = std::make_unique<Impl>(); }
DeviceContext::DeviceContext(const DeviceContext& other) {
  impl_->SetHostAllocator(&other.GetHostAllocator());
  impl_->SetDeviceAllocator(&other.GetDeviceAllocator());
  impl_->SetAllocator(&other.GetAllocator());
  impl_->SetZeroAllocator(&other.GetZeroAllocator());
}
...
...
@@ -134,12 +134,12 @@ DeviceContext::DeviceContext(DeviceContext&& other) {
DeviceContext::~DeviceContext() = default;

void DeviceContext::SetDeviceAllocator(const Allocator* allocator) {
  impl_->SetDeviceAllocator(allocator);
void DeviceContext::SetAllocator(const Allocator* allocator) {
  impl_->SetAllocator(allocator);
}

const Allocator& DeviceContext::GetDeviceAllocator() const {
  return impl_->GetDeviceAllocator();
const Allocator& DeviceContext::GetAllocator() const {
  return impl_->GetAllocator();
}

void DeviceContext::SetHostAllocator(const Allocator* allocator) {
...
...
paddle/pten/core/device_context.h
...
...
@@ -60,7 +60,7 @@ class DeviceContext {
   *
   * @param allocator
   */
  void SetDeviceAllocator(const Allocator*);
  void SetAllocator(const Allocator*);
/**
* @brief Set the host Allocator object.
...
...
@@ -81,7 +81,7 @@ class DeviceContext {
   *
   * @return Allocator
   */
  const Allocator& GetDeviceAllocator() const;
  const Allocator& GetAllocator() const;
/**
* @brief Get the const device-related Allocator object.
...
...
@@ -114,7 +114,7 @@ class DeviceContext {
// TODO(wilber): Just for the convenience of migrating the code, it will be
// modified or removed later.
  virtual Place GetPlace() const = 0;
  virtual const Place& GetPlace() const = 0;

  // TODO(wilber): The fluid framework uses wait() in many places, how to delete
  // this API interface.
  virtual void Wait() const {}
...
...
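The rename above (SetDeviceAllocator/GetDeviceAllocator → SetAllocator/GetAllocator) is what the updated dev-API tests later in this diff rely on. A minimal hedged sketch of the new call pattern; the cpu_context.h include path is an assumption, while the facade lookup mirrors the test files below.

// Sketch: wire a host-side allocator into a pten::CPUContext through the
// renamed DeviceContext API, then initialize the context.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/pten/backends/cpu/cpu_context.h"  // assumed header location

void BuildCpuContextSketch() {
  pten::CPUContext dev_ctx;
  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(paddle::platform::CPUPlace())
                           .get());
  dev_ctx.Init();
}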
paddle/pten/kernels/funcs/cuda_kernel_config.h
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256
...
...
@@ -31,7 +32,7 @@ namespace funcs {
* 2x~4x) than number of SMs. Hence, SM count is taken into account within
* this function to determine the right number of threads per block.
*/
inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext& ctx,
inline int GetThreadsConfig(const pten::GPUContext& ctx,
                            int64_t numel,
                            int vec_size) {
  int threads = ELEMENTWISE_BLOCK_SIZE;
...
...
paddle/pten/kernels/funcs/elementwise_base.h
...
...
@@ -23,8 +23,8 @@ limitations under the License. */
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/backends/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = pten::kps;
...
...
@@ -646,7 +646,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
      VecSize><<<grid_size, block_size, 0, stream>>>(
      ins_data, outs_data, numel, main_offset, func);
#else
  auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
  auto gpu_config =
      pten::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
  int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
                    gpu_config.GetBlockSize();
  auto stream = ctx.stream();
...
...
paddle/pten/kernels/funcs/transpose.cu
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/ddim.h"
...
...
paddle/pten/kernels/gpu/concat_and_split.h
...
...
@@ -237,8 +237,7 @@ __global__ void SplitKernel(const T* input_data,
  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
}

static inline void GetBlockDims(const paddle::platform::CUDADeviceContext& context,
static inline void GetBlockDims(const pten::GPUContext& context,
                                int64_t num_rows,
                                int64_t num_cols,
                                dim3* block_dims,
...
...
paddle/pten/kernels/gpu/copy_kernel.cu
...
...
@@ -87,9 +87,7 @@ void Copy(const Context& dev_ctx,
          ctx_gpu_place));
    auto stream =
        blocking ? nullptr
                 : reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
                       .stream();
                 : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();

    paddle::memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
...
...
@@ -113,9 +111,7 @@ void Copy(const Context& dev_ctx,
          ctx_gpu_place));
    auto stream =
        blocking ? nullptr
                 : reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
                       .stream();
                 : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();

    paddle::memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
...
...
@@ -140,9 +136,7 @@ void Copy(const Context& dev_ctx,
          ctx_gpu_place.device));
    auto stream =
        blocking ? nullptr
                 : reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
                       .stream();
                 : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();

    paddle::memory::Copy(dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
...
...
@@ -167,9 +161,7 @@ void Copy(const Context& dev_ctx,
          ctx_gpu_place.device));
    auto stream =
        blocking ? nullptr
                 : reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
                       .stream();
                 : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();

    paddle::memory::Copy(dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
...
...
@@ -185,9 +177,7 @@ void Copy(const Context& dev_ctx,
          ctx_place));
    auto stream =
        blocking ? nullptr
                 : reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
                       .stream();
                 : reinterpret_cast<const pten::GPUContext&>(dev_ctx).stream();
    if (paddle::platform::is_same_place(src_place, dst_place)) {
      paddle::memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
...
...
paddle/pten/kernels/gpu/elementwise.h
...
...
@@ -2020,7 +2020,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
          T,
          kps::AddFunctor,
          kps::IdentityFunctor<T>>(
          dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
    }
  }
  // dy
...
...
@@ -2038,7 +2038,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
          T,
          kps::AddFunctor,
          kps::IdentityFunctor<T>>(
          dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
          ctx, dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
    }
  }
}
...
...
@@ -2137,7 +2137,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
          T,
          kps::AddFunctor,
          kps::IdentityFunctor<T>>(
          dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
          ctx, dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
    }
  }
  // dy
...
...
@@ -2161,7 +2161,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
          T,
          kps::AddFunctor,
          kps::InverseFunctor<T>>(
          dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
          ctx, dout, dy, kps::InverseFunctor<T>(), reduce_dims, stream);
    }
  }
}
...
...
paddle/pten/kernels/gpu/reduce.h
...
...
@@ -1064,7 +1064,8 @@ template <typename Tx,
          typename Ty,
          template <typename> class ReduceOp,
          typename TransformOp>
void TensorReduceFunctorImpl(const pten::DenseTensor& x,
void TensorReduceFunctorImpl(const pten::GPUContext& dev_ctx,
                             const pten::DenseTensor& x,
                             pten::DenseTensor* y,
                             const TransformOp& transform,
                             const std::vector<int>& origin_reduce_dims,
...
...
@@ -1088,13 +1089,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
  auto x_data = x.data<Tx>();
  auto y_data = y->data<Ty>();

  auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(x.place()));

  if (config.reduce_num == 1) {
    std::vector<const DenseTensor*> inputs = {&x};
    std::vector<DenseTensor*> outputs = {y};
    funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, Tx, Ty>(
        *dev_ctx, inputs, &outputs, transform);
        dev_ctx, inputs, &outputs, transform);
    return;
  }
...
...
@@ -1244,13 +1243,23 @@ void Reduce(const GPUContext& dev_ctx,
            data_t,
            ReduceOp,
            TransformOp<T, MPType>>(
            x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
            dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
      }));
  } else {
    using MPType = typename kps::details::MPTypeTrait<T>::Type;
    pten::kernels::TensorReduceFunctorImpl<T, T, ReduceOp, TransformOp<T, MPType>>(
        x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
        dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims, stream);
  }
}
}  // namespace pten
...
...
paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
...
...
@@ -60,9 +60,11 @@ struct ReduceSumForMatmulGrad<GPUContext, T> {
                  DenseTensor* output,
                  const std::vector<int>& reduce_dims) {
    auto stream = dev_ctx.stream();
    kernels::TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
    kernels::TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims, stream);
  }
};
#endif
...
...
paddle/pten/tests/api/test_matmul_api.cc
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/pten/api/include/api.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/copy_kernel.h"
...
...
@@ -122,7 +123,7 @@ TEST(API, matmul_cuda) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto place = paddle::platform::CUDAPlace();
  auto* dev_ctx = pool.GetByPlace(place);
  auto* dev_ctx = static_cast<const pten::GPUContext*>(pool.GetByPlace(place));

  pten::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
  pten::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
...
...
paddle/pten/tests/api/test_sparse_utils_api.cc
...
...
@@ -47,6 +47,7 @@ TEST(API, to_sparse_coo) {
  std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);

  pten::CPUContext dev_ctx_cpu;
  dev_ctx_cpu.Init();

  // 1. test dense_to_sparse_coo
  paddle::experimental::Tensor x(dense_x);
...
...
paddle/pten/tests/core/test_device_context.cc
...
...
@@ -25,43 +25,29 @@ limitations under the License. */
namespace pten {
namespace tests {

class InferenceCPUContext : public CPUContext {
 public:
  void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
    CPUContext::SetEigenDevice(eigen_device);
  }
};

TEST(DeviceContext, cpu_context) {
  std::cout << "test training scenarios" << std::endl;
  {
    pten::CPUContext ctx;
    ctx.Init();
    EXPECT_TRUE(ctx.eigen_device() != nullptr);
  }

  std::cout << "test inference scenarios" << std::endl;
  Eigen::DefaultDevice* device = new Eigen::DefaultDevice();
  {
    pten::CPUContextResource ctx_res{device};
    pten::CPUContext ctx(ctx_res);
    EXPECT_TRUE(ctx.eigen_device() != nullptr);
  }
  {
    pten::CPUContextResource ctx_res{nullptr};
    pten::CPUContext ctx(ctx_res);
    InferenceCPUContext ctx;
    ctx.SetEigenDevice(device);
    EXPECT_TRUE(ctx.eigen_device() != nullptr);
  }
  delete device;

  std::cout << "test copy constructor" << std::endl;
  {
    pten::CPUContext ctx1;
    pten::CPUContext ctx2(ctx1);
    EXPECT_EQ(ctx1.eigen_device(), ctx2.eigen_device());
  }

  std::cout << "test move constructor" << std::endl;
  {
    pten::CPUContext ctx1 = pten::CPUContext();
    auto* eigen_device1 = ctx1.eigen_device();
    pten::CPUContext ctx2(std::move(ctx1));
    auto* eigen_device2 = ctx2.eigen_device();
    EXPECT_EQ(eigen_device1, eigen_device2);
  }
}

}  // namespace tests
...
...
paddle/pten/tests/kernels/test_cast_dev_api.cc
...
...
@@ -50,10 +50,10 @@ TEST(DEV_API, cast) {
  }

  pten::CPUContext dev_ctx;
  dev_ctx.SetDeviceAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
  dev_ctx.SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CPUPlace())
          .get());
  dev_ctx.Init();

  pten::DataType out_dtype = pten::DataType::FLOAT64;
// 2. test API
...
...
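This is the boilerplate that every CPU dev-API test below repeats after the change: SetDeviceAllocator is replaced by SetAllocator, followed by Init(). In a test suite the sequence could be factored into a tiny helper; the function below is only an illustrative sketch that mirrors the updated test setup (MakeCPUContext is not a Paddle API), assuming the headers these tests already use:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/backends/cpu/cpu_context.h"

// Illustrative helper mirroring the updated test setup: wire the CPU
// allocator from AllocatorFacade into the context, then Init() it.
pten::CPUContext MakeCPUContext() {
  pten::CPUContext dev_ctx;
  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(paddle::platform::CPUPlace())
                           .get());
  dev_ctx.Init();
  return dev_ctx;
}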
paddle/pten/tests/kernels/test_concat_dev_api.cc

@@ -59,10 +59,10 @@ TEST(DEV_API, concat) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Concat<float>(dev_ctx, inputs, 0);
   // 3. check result
paddle/pten/tests/kernels/test_conj_dev_api.cc

@@ -46,10 +46,10 @@ TEST(DEV_API, conj) {
   }
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   // 2. test API
   auto out = pten::Conj<paddle::complex64>(dev_ctx, dense_x);
paddle/pten/tests/kernels/test_copy_dev_api.cc

@@ -58,10 +58,10 @@ TEST(DEV_API, copy) {
   std::cout << typeid(a).name() << std::endl;
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   pten::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
   // 3. check result
paddle/pten/tests/kernels/test_creation_dev_api.cc

@@ -33,10 +33,10 @@ using DDim = pten::framework::DDim;
 TEST(DEV_API, empty) {
   // 1. create input
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   // 2. test API
   auto out = pten::Empty<float>(dev_ctx, {3, 2}, pten::DataType::INT32);

@@ -64,10 +64,10 @@ TEST(DEV_API, empty_like) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::EmptyLike<float>(dev_ctx, dense_x);
   // 3. check result

@@ -84,10 +84,10 @@ TEST(DEV_API, full) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Full<float>(dev_ctx, {3, 2}, val, pten::DataType::FLOAT32);
   // 3. check result

@@ -118,10 +118,10 @@ TEST(DEV_API, full_like) {
   float val = 1.0;
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   // 2. test API
   auto out = pten::FullLike<float>(dev_ctx, dense_x, val);
paddle/pten/tests/kernels/test_dot_dev_api.cc

@@ -60,10 +60,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Dot<float>(dev_ctx, dense_x, dense_y);
   // 3. check result
paddle/pten/tests/kernels/test_elementwise_dev_api.cc

@@ -62,10 +62,10 @@ TEST(DEV_API, add) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
   // 3. check result

@@ -116,10 +116,10 @@ TEST(DEV_API, subtract) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
   // 3. check result

@@ -170,10 +170,10 @@ TEST(DEV_API, divide) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
   // 3. check result

@@ -224,10 +224,10 @@ TEST(DEV_API, multiply) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
   // 3. check result
paddle/pten/tests/kernels/test_flatten_dev_api.cc

@@ -56,10 +56,10 @@ TEST(DEV_API, flatten) {
   }
   int start_axis = 1, stop_axis = 2;
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   // 2. test API
   auto out = pten::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
paddle/pten/tests/kernels/test_matmul_dev_api.cc

@@ -55,10 +55,10 @@ TEST(DEV_API, dot) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
   // 3. check result
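Note the call form Matmul<float, CPUContext>(dev_ctx, ...): the dev-API kernels exercised by these tests are templated on the context type, so the same kernel template can be instantiated with a CPU context or, elsewhere, a GPU context. A minimal, self-contained sketch of that dispatch idea with stand-in types (these are not the actual pten kernel signatures):

#include <iostream>
#include <vector>

// Stand-ins for pten::CPUContext / pten::GPUContext; illustration only.
struct ToyCPUContext { static const char* name() { return "CPU"; } };
struct ToyGPUContext { static const char* name() { return "GPU"; } };

// A kernel templated on the context type, mirroring the
// Matmul<T, Context>(dev_ctx, ...) call shape used in the tests.
template <typename T, typename Context>
std::vector<T> ToyMatmul(const Context& dev_ctx,
                         const std::vector<T>& x,
                         const std::vector<T>& y) {
  std::cout << "running on " << Context::name() << std::endl;
  // 1x1 "matmul" just to keep the sketch self-contained.
  return {x[0] * y[0]};
}

int main() {
  ToyCPUContext cpu_ctx;
  auto out = ToyMatmul<float, ToyCPUContext>(cpu_ctx, {2.0f}, {3.0f});
  std::cout << out[0] << std::endl;  // prints 6
  return 0;
}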
paddle/pten/tests/kernels/test_mean_dev_api.cc

@@ -49,10 +49,10 @@ TEST(DEV_API, mean) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Mean<float>(dev_ctx, dense_x, dims, false);
   // 3. check result
paddle/pten/tests/kernels/test_reshape_dev_api.cc

@@ -48,10 +48,10 @@ TEST(DEV_API, reshape) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Reshape<float>(dev_ctx, dense_x, shape);
   // 3. check result
   std::vector<int64_t> expect_shape = {12, 3};
paddle/pten/tests/kernels/test_scale_dev_api.cc

@@ -49,10 +49,11 @@ TEST(DEV_API, scale) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);

@@ -92,10 +93,11 @@ TEST(DEV_API, scale_host) {
   // 2. test API
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   auto out = pten::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
paddle/pten/tests/kernels/test_sparse_utils_dev_api.cc

@@ -15,6 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/common/place.h"
 #include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/sparse/sparse_utils_kernel.h"

@@ -22,6 +24,8 @@ limitations under the License. */
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
 namespace pten {
 namespace tests {

@@ -38,9 +42,8 @@ inline void CheckResult(
   ASSERT_EQ(coo.nnz(), non_zero_num);
 #if defined(PADDLE_WITH_CUDA)
-  if (coo.place() == paddle::platform::CUDAPlace()) {
-    const auto* dev_ctx_cuda =
-        static_cast<const paddle::platform::CUDADeviceContext*>(dev_ctx);
+  if (coo.place() == pten::GPUPlace()) {
+    const auto* dev_ctx_cuda = static_cast<const pten::GPUContext*>(dev_ctx);
     DenseTensor indices(
         alloc.get(),
         DenseTensorMeta(

@@ -86,6 +89,8 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
       paddle::platform::CPUPlace());
   pten::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.Init();
+
   // 1. test cpu
   auto cpu_sparse_out =
       sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);

@@ -98,9 +103,21 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
   // 2. test cuda
 #if defined(PADDLE_WITH_CUDA)
-  paddle::platform::DeviceContextPool& pool =
-      paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
+  // paddle::platform::DeviceContextPool& pool =
+  //     paddle::platform::DeviceContextPool::Instance();
+  // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
+  pten::GPUContext dev_ctx_gpu;
+  dev_ctx_gpu.PartialInitWithoutAllocator();
+  dev_ctx_gpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
+          .get());
+  dev_ctx_gpu.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(pten::CPUPlace())
+          .get());
+  dev_ctx_gpu.PartialInitWithAllocator();
   const auto cuda_alloc =
       std::make_shared<paddle::experimental::DefaultAllocator>(
           paddle::platform::CUDAPlace());

@@ -108,10 +125,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
       cuda_alloc.get(),
       DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
-  pten::Copy(*dev_ctx_cuda, dense_x, true, &d_dense_x);
+  pten::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
   auto sparse_out =
-      sparse::DenseToSparseCoo<T>(*dev_ctx_cuda, d_dense_x, sparse_dim);
-  CheckResult<T, int64_t>(dev_ctx_cuda,
+      sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
+  CheckResult<T, int64_t>(&dev_ctx_gpu,
                           sparse_out,
                           non_zero_data,
                           indices_data,

@@ -295,20 +312,32 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
       alloc);
   // 2. test cuda
 #if defined(PADDLE_WITH_CUDA)
+  pten::GPUContext dev_ctx_gpu;
+  dev_ctx_gpu.PartialInitWithoutAllocator();
+  dev_ctx_gpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
+          .get());
+  dev_ctx_gpu.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(pten::CPUPlace())
+          .get());
+  dev_ctx_gpu.PartialInitWithAllocator();
   const auto cuda_alloc =
       std::make_shared<paddle::experimental::DefaultAllocator>(
          paddle::platform::CUDAPlace());
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
+  // auto& pool = paddle::platform::DeviceContextPool::Instance();
+  // auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
   pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
   pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
   pten::DenseTensor d_values(cuda_alloc.get(), values_meta);
-  pten::Copy(*dev_ctx_cuda, crows, true, &d_crows);
-  pten::Copy(*dev_ctx_cuda, cols, true, &d_cols);
-  pten::Copy(*dev_ctx_cuda, values, true, &d_values);
+  pten::Copy(dev_ctx_gpu, crows, true, &d_crows);
+  pten::Copy(dev_ctx_gpu, cols, true, &d_cols);
+  pten::Copy(dev_ctx_gpu, values, true, &d_values);
   pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
-  auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(*dev_ctx_cuda, d_csr);
-  CheckResult<T, int64_t>(dev_ctx_cuda,
+  auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
+  CheckResult<T, int64_t>(&dev_ctx_gpu,
                           cuda_sparse_out,
                           non_zero_data,
                           indices_data,
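The GPU-side tests above no longer borrow the CUDADeviceContext from the DeviceContextPool; they build a standalone pten::GPUContext in two phases, because the stream must exist before the stream-aware device allocator can be requested from AllocatorFacade. Factoring the exact sequence shown in the hunks into a helper could look like the sketch below; it simply mirrors the test code, and the helper name InitTestGPUContext is illustrative, not a Paddle API:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/common/place.h"

// Illustrative helper mirroring the two-phase GPUContext setup in the tests:
// create the stream first, then hand the stream-aware device allocator and a
// host allocator to the context, then finish initialization.
void InitTestGPUContext(pten::GPUContext* dev_ctx_gpu) {
  dev_ctx_gpu->PartialInitWithoutAllocator();
  dev_ctx_gpu->SetAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(dev_ctx_gpu->GetPlace(), dev_ctx_gpu->stream())
          .get());
  dev_ctx_gpu->SetHostAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(pten::CPUPlace())
          .get());
  dev_ctx_gpu->PartialInitWithAllocator();
}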
paddle/pten/tests/kernels/test_sum_dev_api.cc

@@ -47,10 +47,11 @@ TEST(DEV_API, sum) {
   std::vector<int64_t> axis = {0, 1};
   pten::CPUContext dev_ctx;
-  dev_ctx.SetDeviceAllocator(
-      paddle::memory::allocation::AllocatorFacade::Instance()
+  dev_ctx.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
+  dev_ctx.Init();
   // 2. test API
   auto out = pten::Sum<float>(dev_ctx, dense_x, axis, pten::DataType::FLOAT32, false);