Commit 88490567 (unverified)
Authored on Jul 29, 2022 by Leo Chen; committed via GitHub on Jul 29, 2022
unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove cudaDeviceContext
* remove more template
* fix rocm compile
Parent: 0a2db7c8

Showing 26 changed files with 122 additions and 2801 deletions (+122, -2801)
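The whole change reduces to one idea: paddle::platform::CUDADeviceContext stops being a separate class derived from phi::GPUContext and becomes a plain type alias for it (see the paddle/fluid/platform/device_context.h hunk below), so most of the per-file edits delete forward declarations, template instantiations, and specializations that would now be duplicates. A minimal, self-contained sketch of that aliasing technique (stand-in class bodies here, not the real Paddle headers):

#include <type_traits>

namespace phi {
class GPUContext { /* stand-in for the real phi::GPUContext */ };
}  // namespace phi

namespace paddle {
namespace platform {
// After this commit the fluid name is only an alias, not a derived class.
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform
}  // namespace paddle

// Both spellings now name exactly one type, which is why duplicated
// CUDADeviceContext/GPUContext template instantiations are removed below.
static_assert(
    std::is_same<paddle::platform::CUDADeviceContext, phi::GPUContext>::value,
    "one context type, one set of instantiations");

int main() { return 0; }

With the alias in place, downstream code that still spells out the fluid name keeps compiling unchanged while resolving to the phi type.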
paddle/fluid/framework/details/eager_deletion_op_handle.h        +0    -6
paddle/fluid/memory/allocation/cuda_device_context_allocator.h   +0    -5
paddle/fluid/operators/cudnn_lstm_op.cu.cc                       +0    -7
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu             +4    -3
paddle/fluid/operators/gru_op.cu.cc                              +0    -7
paddle/fluid/operators/math/cross_entropy.cu                     +0    -5
paddle/fluid/operators/math/im2col.cu                            +0    -24
paddle/fluid/operators/math/maxouting.cu                         +0    -6
paddle/fluid/operators/math/sample_prob.h                        +0    -6
paddle/fluid/operators/math/selected_rows_functor.cu             +2    -159
paddle/fluid/operators/math/sequence_padding.cu                  +0    -159
paddle/fluid/operators/math/sequence_scale.cu                    +0    -40
paddle/fluid/operators/math/softmax.cu                           +0    -35
paddle/fluid/operators/math/vol2col.cu                           +0    -4
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc     +0    -6
paddle/fluid/platform/collective_helper.h                        +0    -1
paddle/fluid/platform/device_context.cc                          +0    -5
paddle/fluid/platform/device_context.h                           +1    -9
paddle/fluid/platform/transform.h                                +0    -60
paddle/phi/kernels/funcs/blas/blas_impl.cu.h                     +80   -1263
paddle/phi/kernels/funcs/blas/blas_impl.hip.h                    +22   -906
paddle/phi/kernels/funcs/fc_functor.cu                           +0    -4
paddle/phi/kernels/funcs/for_range.h                             +0    -16
paddle/phi/kernels/funcs/math_function.cu                        +13   -56
paddle/phi/kernels/funcs/matrix_inverse.cu.cc                    +0    -5
paddle/phi/kernels/funcs/matrix_solve.cu                         +0    -4
paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
 namespace paddle {
 namespace framework {
 class GarbageCollector;
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
 namespace memory {
 namespace allocation {
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
 namespace paddle {
 namespace operators {
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                              config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
     // not need show click input
     N = static_cast<size_t>(batch_size * slot_num * (embedding_size - cvm_offset));
-    platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(dev_ctx, N);
     FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                           config.thread_per_block.x,
                           0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
paddle/fluid/operators/gru_op.cu.cc
@@ -14,13 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
 namespace paddle {
 namespace operators {
paddle/fluid/operators/math/cross_entropy.cu
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
   }
 }
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, platform::float16>;
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
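The same deletion pattern repeats across the math functors below: once the two context names refer to a single type, keeping both sets of explicit instantiations would instantiate the same specialization twice, which C++ rejects as a duplicate explicit instantiation. A minimal, self-contained sketch of why the platform::CUDADeviceContext lines go away (using a hypothetical Functor, not the real Paddle functors):

namespace phi {
class GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;  // the alias introduced by this commit
}  // namespace platform

template <typename DeviceContext, typename T>
struct Functor {  // hypothetical stand-in for CrossEntropyFunctor and friends
  void operator()(const DeviceContext&, T) const {}
};

// Only the phi::GPUContext instantiation is kept; an additional
// "template struct Functor<platform::CUDADeviceContext, float>;" would now be
// a duplicate explicit instantiation of the very same specialization.
template struct Functor<phi::GPUContext, float>;

int main() { return 0; }

Only the phi::GPUContext spelling is kept in the diffs that follow.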
paddle/fluid/operators/math/im2col.cu
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
   }
 };
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext, float>;
@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
   }
 };
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext, float>;
@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext, double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext, float>;
paddle/fluid/operators/math/maxouting.cu
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
       axis);
 }
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/sample_prob.h
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
 namespace paddle {
 namespace operators {
 namespace math {
paddle/fluid/operators/math/selected_rows_functor.cu
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
   }
 }  // namespace
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height, in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height, in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height, out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height, out_dims[0]));
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel, input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel, output->numel() / in1_height));
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
-            in1_row_numel);
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
   }
 };
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, platform::float16>;
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
   }
 }  // namespace
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height, in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height, in2_dims[0]));
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel, input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel, input2->numel() / in1_height));
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
-            in1_row_numel);
-  }
-};
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   }
 };
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
   }
 };
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(context, input,
-                                                          sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(context, input, output,
-                                                   sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(context, inputs, output,
-                                                   sorted_result);
-  }
-};
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
   // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
   }
 };
-#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                     \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>;  \
-  template struct MergeAddImpl<phi::GPUContext, dtype>;              \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;      \
+#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)        \
+  template struct MergeAddImpl<phi::GPUContext, dtype>; \
   template struct MergeAdd<phi::GPUContext, dtype>;
 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
paddle/fluid/operators/math/sequence_padding.cu
@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
   }
 }
-template <typename T>
-class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    PADDLE_ENFORCE_GE(
-        pad_seq_len, max_seq_len,
-        platform::errors::InvalidArgument(
-            "The pad_seq_len must be equal to or greater than the "
-            "original max sequence length. Expected %ld >= %ld, but got %ld < "
-            "%ld. Please check the input value.",
-            pad_seq_len, max_seq_len, pad_seq_len, max_seq_len));
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-    PADDLE_ENFORCE_EQ(
-        pad_value.numel() == 1 || pad_value.numel() == step_width, true,
-        platform::errors::InvalidArgument(
-            "The numel of 'pad_value' can only be 1 or be equal to "
-            "the 'step_width', but got %ld != 1 and %ld. Please check the "
-            "input value.",
-            pad_value.numel(), step_width));
-    const int kBlockSize = 512;
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-    const T* seq_data = seq_tensor.data<T>();
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
-        pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
-        mix_vector_seq_offsets.CUDAData(context.GetPlace()), seq_num,
-        pad_seq_len, step_width, norm_by_times, layout);
-  }
-};
 template <typename T>
 class PaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
-              step_width, layout);
-    /*
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
-                                    seq_tensor);
-      seq_tensor->Resize(seq_tensor_dims);
-      return;
-    }
-    */
-    const int kBlockSize = 512;
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-    const T* pad_data = pad_tensor.data<T>();
-    T* seq_data = seq_tensor->data<T>();
-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
-        seq_data, pad_data, nullptr, false,
-        mixv_seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, layout);
-  }
-};
 template <typename T>
 class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
paddle/fluid/operators/math/sequence_scale.cu
@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
   }
 }
-template <typename T>
-class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq->numel() / seq->dims()[0];
-    auto abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
-#ifdef PADDLE_WITH_HIP
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
-        dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(),
-        seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
-        seq_width);
-#else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
-        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-            seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
-            seq_width);
-#endif
-    mix_vector.CopyToCPU();
-  }
-};
 template <typename T>
 class ScaleLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
 template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
 template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/softmax.cu
@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
 #endif
 }
-template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxCUDNNFunctor<platform::float16, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::float16, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
 #if CUDNN_VERSION_MIN(8, 1, 0)
-template class SoftmaxCUDNNFunctor<platform::bfloat16, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::bfloat16, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 #endif
 // MIOPEN do not support double
 #ifndef PADDLE_WITH_HIP
-template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
 #endif
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::bfloat16, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::bfloat16, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, platform::float16>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, platform::bfloat16>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
 template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
paddle/fluid/operators/math/vol2col.cu
@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
 }
 // };
-template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
-template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
 template class Vol2ColFunctor<phi::GPUContext, float>;
 template class Vol2ColFunctor<phi::GPUContext, double>;
-template class Col2VolFunctor<platform::CUDADeviceContext, float>;
-template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 template class Col2VolFunctor<phi::GPUContext, float>;
 template class Col2VolFunctor<phi::GPUContext, double>;
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -16,12 +16,6 @@
 #include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
 REGISTER_OP_CUDA_KERNEL(
     sequence_concat,
     paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/platform/collective_helper.h
@@ -51,7 +51,6 @@ namespace platform {
 //
 // The NCCLComm instance is created and reversed in the NCCLCommContext
 // singleton with a global user specified group id.
-class CUDADeviceContext;
 class NCCLComm {
  public:
paddle/fluid/platform/device_context.cc
@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
   allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }
-CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {}
-
-CUDADeviceContext::~CUDADeviceContext() = default;
-
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
paddle/fluid/platform/device_context.h
@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;
-class CUDADeviceContext : public phi::GPUContext {
- public:
-  explicit CUDADeviceContext(CUDAPlace place);
-  virtual ~CUDADeviceContext();
-
- private:
-  int place_holder_;  // TO BE REMOVED
-  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
-};
+using CUDADeviceContext = phi::GPUContext;
 class CudnnWorkspaceHandle {
  public:
paddle/fluid/platform/transform.h
@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
 };
 #if defined(__NVCC__) || defined(__HIPCC__)
-template <>
-struct Transform<platform::CUDADeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter first,
-                  InputIter last,
-                  OutputIter result,
-                  UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-
-  template <typename InputIter1,
-            typename InputIter2,
-            typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter1 first1,
-                  InputIter1 last1,
-                  InputIter2 first2,
-                  OutputIter result,
-                  BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-};
 template <>
 struct Transform<phi::GPUContext> {
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -66,58 +66,6 @@ struct CUBlas<float> {
 #endif
   }
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb,
-                      int m, int n, int k,
-                      const float *alpha, const void *A, cudaDataType_t Atype,
-                      int lda, const void *B, cudaDataType_t Btype, int ldb,
-                      const float *beta, void *C, cudaDataType_t Ctype,
-                      int ldc) {
-    // Because the gcc 4.8 doesn't expand template parameter pack that
-    // appears in a lambda-expression, I can not use template parameter pack
-    // here.
-#if CUDA_VERSION >= 8000
-    VLOG(5) << "use_tensor_op_math: "
-            << (dev_ctx->tensor_core_available() ? "True" : "False");
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasSgemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype,
-          ldb, beta, C, Ctype, ldc));
-    });
-#else
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "cublasSgemmEx is not supported on cuda <= 7.5"));
-#endif
-  }
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -366,66 +314,6 @@ struct CUBlas<phi::dtype::float16> {
 #endif
   }
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb,
-                      int m, int n, int k,
-                      const void *alpha, const void *A, cudaDataType_t Atype,
-                      int lda, const void *B, cudaDataType_t Btype, int ldb,
-                      const void *beta, void *C, cudaDataType_t Ctype,
-                      int ldc, cudaDataType_t computeType) {
-#if CUDA_VERSION >= 8000
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-    bool use_tensor_op_math = dev_ctx->tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: "
-            << (use_tensor_op_math ? "True" : "False");
-#endif  // CUDA_VERSION >= 9000
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype,
-          ldb, beta, C, Ctype, ldc, computeType, algo));
-    });
-#else
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "cublasGemmEx is not supported on cuda <= 7.5"));
-#endif
-  }
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -636,66 +524,6 @@ struct CUBlas<phi::dtype::complex<float>> {
         ldb));
   }
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb,
-                      int m, int n, int k,
-                      const void *alpha, const void *A, cudaDataType_t Atype,
-                      int lda, const void *B, cudaDataType_t Btype, int ldb,
-                      const void *beta, void *C, cudaDataType_t Ctype,
-                      int ldc, cudaDataType_t computeType) {
-#if CUDA_VERSION >= 8000
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-    bool use_tensor_op_math = dev_ctx->tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: "
-            << (use_tensor_op_math ? "True" : "False");
-#endif  // CUDA_VERSION >= 9000
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype,
-          ldb, beta, C, Ctype, ldc, computeType, algo));
-    });
-#else
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "cublasGemmEx is not supported on cuda <= 7.5"));
-#endif
-  }
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -965,66 +793,6 @@ struct CUBlas<phi::dtype::complex<double>> {
         batch_size));
   }
-  // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
-  // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
-  template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      cublasOperation_t transa, cublasOperation_t transb,
-                      int m, int n, int k,
-                      const void *alpha, const void *A, cudaDataType_t Atype,
-                      int lda, const void *B, cudaDataType_t Btype, int ldb,
-                      const void *beta, void *C, cudaDataType_t Ctype,
-                      int ldc, cudaDataType_t computeType) {
-#if CUDA_VERSION >= 8000
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-    bool use_tensor_op_math = dev_ctx->tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: "
-            << (use_tensor_op_math ? "True" : "False");
-#endif  // CUDA_VERSION >= 9000
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype,
-          ldb, beta, C, Ctype, ldc, computeType, algo));
-    });
-#else
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "cublasGemmEx is not supported on cuda <= 7.5"));
-#endif
-  }
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -1088,16 +856,16 @@
 template <>
 template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                                     CBLAS_TRANSPOSE transB,
-                                                     int M, int N, int K,
-                                                     T alpha,
-                                                     const T *A, const T *B,
-                                                     T beta, T *C) const {
+void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                 CBLAS_TRANSPOSE transB,
+                                 int M, int N, int K,
+                                 T alpha,
+                                 const T *A, const T *B,
+                                 T beta, T *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -1109,8 +877,7 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
 #if CUDA_VERSION >= 8000
   if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx =
-        const_cast<paddle::platform::CUDADeviceContext &>(context_);
+    auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
     CUBlas<T>::GEMM_EX(&cuda_ctx,
                        cuTransB,
                        cuTransA,
@@ -1152,151 +919,6 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 8000
 }
-template <>
-template <typename T>
-void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                 CBLAS_TRANSPOSE transB,
-                                 int M, int N, int K,
-                                 T alpha, const T *A, const T *B,
-                                 T beta, T *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-#if CUDA_VERSION >= 8000
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
-    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
-                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
-                       CUDA_R_32F, N);
-  } else {
-#endif  // CUDA_VERSION >= 8000
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-                      lda, &beta, C, N);
-    });
-#if CUDA_VERSION >= 8000
-  }
-#endif  // CUDA_VERSION >= 8000
-}
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::float16 alpha, const phi::dtype::float16 *A,
-    const phi::dtype::float16 *B, phi::dtype::float16 beta,
-    phi::dtype::float16 *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(
-      context_.GetComputeCapability(), 53,
-      phi::errors::InvalidArgument(
-          "cublas fp16 gemm requires GPU compute capability >= 53,"
-          "but received %d", context_.GetComputeCapability()));
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K,
-                                       &h_alpha, B, CUDA_R_16F, ldb, A,
-                                       CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F,
-                                       N, CUDA_R_32F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
-                                      &h_alpha, h_B, ldb, h_A, lda, &h_beta,
-                                      h_C, N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -1376,77 +998,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 8000
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 *A,
-    const phi::dtype::bfloat16 *B, phi::dtype::bfloat16 beta,
-    phi::dtype::bfloat16 *C) const {
-#if CUDA_VERSION >= 11000
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(
-      context_.GetComputeCapability(), 80,
-      phi::errors::InvalidArgument(
-          "cublas fp16 gemm requires GPU compute capability >= 80,"
-          "but received %d", context_.GetComputeCapability()));
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-  cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
-  bool use_tensor_op_math = context_.tensor_core_available();
-  if (use_tensor_op_math) {
-    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-  }
-  VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
-  context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx(
-        handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A,
-        CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo));
-  });
-#else
-  // raise error
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "cublasGemmEx with bfloat16 is not supported on cuda <= 11"));
-#endif  // CUDA_VERSION >= 11000
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -1517,87 +1068,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 11000
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::complex<float> alpha, const phi::dtype::complex<float> *A,
-    const phi::dtype::complex<float> *B, phi::dtype::complex<float> beta,
-    phi::dtype::complex<float> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(
-      context_.GetComputeCapability(), 53,
-      phi::errors::InvalidArgument(
-          "cublas complex64 gemm requires GPU compute capability >= 53,"
-          "but received %d", context_.GetComputeCapability()));
-  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
-  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N,
-                                              M, K, &c_alpha, B, CUDA_C_32F,
-                                              ldb, A, CUDA_C_32F, lda, &c_beta,
-                                              C, CUDA_C_32F, N, CUDA_C_32F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::complex<float>>::GEMM(handle, cuTransB, cuTransA, N, M,
-                                             K, &c_alpha, h_B, ldb, h_A, lda,
-                                             &c_beta, h_C, N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -1680,17 +1150,16 @@
 template <>
 template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA,
-    CBLAS_TRANSPOSE transB,
-    int M, int N, int K,
-    phi::dtype::complex<double> alpha,
-    const phi::dtype::complex<double> *A,
-    const phi::dtype::complex<double> *B,
-    phi::dtype::complex<double> beta,
-    phi::dtype::complex<double> *C) const {
+inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                        CBLAS_TRANSPOSE transB,
+                                        int M, int N, int K,
+                                        phi::dtype::complex<double> alpha,
+                                        const phi::dtype::complex<double> *A,
+                                        const phi::dtype::complex<double> *B,
+                                        phi::dtype::complex<double> beta,
+                                        phi::dtype::complex<double> *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -1719,7 +1188,7 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
   // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
   // input/output in fp16, computation in fp32, which can also be accelerated
   // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
+  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
   CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
                                                cuTransB,
                                                cuTransA,
@@ -1760,153 +1229,6 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
 #endif  // CUDA_VERSION >= 8000
 }
-template <>
-template <>
-inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                        CBLAS_TRANSPOSE transB,
-                                        int M, int N, int K,
-                                        phi::dtype::complex<double> alpha,
-                                        const phi::dtype::complex<double> *A,
-                                        const phi::dtype::complex<double> *B,
-                                        phi::dtype::complex<double> beta,
-                                        phi::dtype::complex<double> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(
-      context_.GetComputeCapability(), 53,
-      phi::errors::InvalidArgument(
-          "cublas complex128 gemm requires GPU compute capability >= 53,"
-          "but received %d", context_.GetComputeCapability()));
-  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
-  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
-  CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA,
-                                               N, M, K, &c_alpha, B, CUDA_C_64F,
-                                               ldb, A, CUDA_C_64F, lda,
-                                               &c_beta, C, CUDA_C_64F, N,
-                                               CUDA_C_64F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::complex<double>>::GEMM(handle, cuTransB, cuTransA, N, M,
-                                              K, &c_alpha, h_B, ldb, h_A, lda,
-                                              &c_beta, h_C, N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA,
-                                                     bool transB,
-                                                     int M, int N, int K,
-                                                     T alpha,
-                                                     const T *A, int lda,
-                                                     const T *B, int ldb,
-                                                     T beta, T *C,
-                                                     int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-#if CUDA_VERSION >= 8000
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B,
-                       CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C,
-                       CUDA_R_32F, ldc);
-  } else {
-#endif  // CUDA_VERSION >= 8000
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-                      lda, &beta, C, ldc);
-    });
-#if CUDA_VERSION >= 8000
-  }
-#endif  // CUDA_VERSION >= 8000
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMM(bool transA,
@@ -1972,45 +1294,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
 #endif  // CUDA_VERSION >= 8000
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    bool transA, bool transB, int M, int N, int K,
-    phi::dtype::float16 alpha, const phi::dtype::float16 *A, int lda,
-    const phi::dtype::float16 *B, int ldb, phi::dtype::float16 beta,
-    phi::dtype::float16 *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
-                                      &alpha, B, ldb, A, lda, &beta, C, ldc);
-  });
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(bool transA,
@@ -2049,17 +1332,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n,
-                                                     T alpha,
-                                                     const T *x,
-                                                     T *y) const {
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
@@ -2068,15 +1340,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n,
-                                                     const T alpha,
-                                                     T *x) const {
-  context_.CublasCall(
-      [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
@@ -2086,247 +1349,67 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
template
<
>
template
<
typename
T
>
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
VCOPY
(
int
n
,
const
T
*
x
,
T
*
y
)
const
{
context_
.
CublasCall
(
[
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
VCOPY
(
handle
,
n
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
phi
::
GPUContext
>::
VCOPY
(
int
n
,
const
T
*
x
,
T
*
y
)
const
{
context_
.
CublasCall
(
[
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
VCOPY
(
handle
,
n
,
x
,
1
,
y
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
cublasOperation_t
cuTransA
=
!
trans_a
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMV
(
handle
,
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
});
}
template
<
>
template
<
typename
T
>
void
Blas
<
phi
::
GPUContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
const
T
*
A
,
const
T
*
B
,
T
beta
,
T
*
C
)
const
{
cublasOperation_t
cuTransA
=
!
trans_a
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMV
(
handle
,
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
});
}
template
<
>
template
<
>
inline
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
phi
::
dtype
::
float16
alpha
,
const
phi
::
dtype
::
float16
*
A
,
const
phi
::
dtype
::
float16
*
B
,
phi
::
dtype
::
float16
beta
,
phi
::
dtype
::
float16
*
C
)
const
{
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if
(
trans_a
)
{
this
->
template
GEMM
<
phi
::
dtype
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
1
,
N
,
M
,
alpha
,
B
,
A
,
beta
,
C
);
}
else
{
this
->
template
GEMM
<
phi
::
dtype
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
M
,
1
,
N
,
alpha
,
A
,
B
,
beta
,
C
);
}
}
template
<
>
template
<
>
inline
void
Blas
<
phi
::
GPUContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
phi
::
dtype
::
float16
alpha
,
const
phi
::
dtype
::
float16
*
A
,
const
phi
::
dtype
::
float16
*
B
,
phi
::
dtype
::
float16
beta
,
phi
::
dtype
::
float16
*
C
)
const
{
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if
(
trans_a
)
{
this
->
template
GEMM
<
phi
::
dtype
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
1
,
N
,
M
,
alpha
,
B
,
A
,
beta
,
C
);
}
else
{
this
->
template
GEMM
<
phi
::
dtype
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
M
,
1
,
N
,
alpha
,
A
,
B
,
beta
,
C
);
}
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
                                                            phi::dtype::bfloat16 alpha,
                                                            const phi::dtype::bfloat16 *A,
                                                            const phi::dtype::bfloat16 *B,
                                                            phi::dtype::bfloat16 beta,
                                                            phi::dtype::bfloat16 *C) const {
  // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
  // it.
  if (trans_a) {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
}
template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a, int M, int N,
                                        phi::dtype::bfloat16 alpha,
                                        const phi::dtype::bfloat16 *A,
                                        const phi::dtype::bfloat16 *B,
                                        phi::dtype::bfloat16 beta,
                                        phi::dtype::bfloat16 *C) const {
  // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
  // it.
  if (trans_a) {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
}
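Side note for reviewers: the float16/bfloat16 GEMV specializations above fall back to GEMM because the vendor libraries lack a half/bfloat16 GEMV; the mapping is simply "a vector is a one-column (or one-row) matrix". A self-contained CPU check of that identity (plain C++, illustrative names only, not from this patch):

#include <cassert>
#include <vector>

// Naive row-major GEMM reference: C(MxN) = A(MxK) * B(KxN).
static void NaiveGemm(int M, int N, int K, const float* A, const float* B, float* C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
      C[i * N + j] = acc;
    }
}

int main() {
  const int M = 3, N = 2;
  std::vector<float> A = {1, 2, 3, 4, 5, 6};  // 3x2, row-major
  std::vector<float> x = {10, 20};            // length 2
  // y = A * x, expressed as a GEMM with x treated as an Nx1 matrix
  // (the non-transposed branch above: GEMM(M, 1, N, ...)).
  std::vector<float> y(M);
  NaiveGemm(M, /*N=*/1, /*K=*/N, A.data(), x.data(), y.data());
  assert(y[0] == 50 && y[1] == 110 && y[2] == 170);
  // y = A^T * v, expressed as a 1xN GEMM with the vector on the left
  // (the trans_a branch above: GEMM(1, N, M, ...) with B and A swapped).
  std::vector<float> v = {1, 1, 1};           // length 3
  std::vector<float> yt(N);
  NaiveGemm(/*M=*/1, N, /*K=*/M, v.data(), A.data(), yt.data());
  assert(yt[0] == 9 && yt[1] == 12);
  return 0;
}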
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
    const T *A, const T *B, T beta, T *C, int batchCount, int64_t strideA,
    int64_t strideB) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const int64_t strideC = M * N;

#if CUDA_VERSION >= 9010
  if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
      std::is_same<T, phi::dtype::float16>::value) {
    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
    bool use_tensor_op_math = context_.tensor_core_available();
    if (use_tensor_op_math) {
      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
    }
    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
    VLOG(4) << "use_half_precision_compute_type: "
            << FLAGS_gemm_use_half_precision_compute_type;

    auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
    cudaDataType_t compute_type = fp;

    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
    void *a = static_cast<void *>(&h_alpha);
    void *b = static_cast<void *>(&h_beta);
    // set ComputeType as CUDA_R_32F for fp16, for better accuracy
    if (FLAGS_gemm_use_half_precision_compute_type == true &&
        std::is_same<T, phi::dtype::float16>::value) {
      a = static_cast<void *>(&alpha);
      b = static_cast<void *>(&beta);
      compute_type = CUDA_R_16F;
    }

    // set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy
    context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmStridedBatchedEx(
          handle, cuTransB, cuTransA, N, M, K, a, B, fp, ldb, strideB, A, fp, lda,
          strideA, b, C, fp, ldc, strideC, batchCount, compute_type, algo));
    });
  } else {
#endif  // CUDA_VERSION >= 9010
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
  context_.CublasCall(
      [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}
    context_.CublasCall([&](cublasHandle_t handle) {
      CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
                                    strideB, A, lda, strideA, &beta, C, ldc, strideC,
                                    batchCount);
    });

template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a, int M, int N, T alpha,
                                 const T *A, const T *B, T beta, T *C) const {
  cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
#if CUDA_VERSION >= 9010
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
  });
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a, int M, int N,
                                        phi::dtype::float16 alpha,
                                        const phi::dtype::float16 *A,
                                        const phi::dtype::float16 *B,
                                        phi::dtype::float16 beta,
                                        phi::dtype::float16 *C) const {
  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
  if (trans_a) {
    this->template GEMM<phi::dtype::float16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::float16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a, int M, int N,
                                        phi::dtype::bfloat16 alpha,
                                        const phi::dtype::bfloat16 *A,
                                        const phi::dtype::bfloat16 *B,
                                        phi::dtype::bfloat16 beta,
                                        phi::dtype::bfloat16 *C) const {
  // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
  // it.
  if (trans_a) {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
#endif  // CUDA_VERSION >= 9010
}

template <>
...
...
@@ -2438,78 +1521,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif  // CUDA_VERSION >= 9010
}
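Side note for reviewers: every wrapper in this file forwards a row-major call to a column-major BLAS by swapping the two operands and swapping M/N, as the "cublas follows fortran order" comments say. A self-contained CPU check of that identity (plain C++, no Paddle or cuBLAS dependency; names are illustrative only):

#include <cassert>
#include <vector>

// Column-major GEMM reference: C(m x n) = A(m x k) * B(k x n), with leading
// dimensions lda/ldb/ldc, mimicking the cuBLAS/Fortran convention.
static void ColMajorGemm(int m, int n, int k, const float* A, int lda,
                         const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  // Row-major A(2x3) and B(3x2); we want row-major C(2x2) = A * B.
  const int M = 2, N = 2, K = 3;
  std::vector<float> A = {1, 2, 3, 4, 5, 6};
  std::vector<float> B = {7, 8, 9, 10, 11, 12};
  std::vector<float> C(M * N, 0.f);
  // A row-major M x K buffer is a column-major K x M matrix (A^T), so the
  // wrappers compute C^T = B^T * A^T by swapping operands and swapping M/N;
  // the column-major C^T they produce is exactly row-major C in memory.
  ColMajorGemm(N, M, K, B.data(), /*lda=*/N, A.data(), /*ldb=*/K, C.data(), /*ldc=*/N);
  assert(C[0] == 58 && C[1] == 64 && C[2] == 139 && C[3] == 154);
  return 0;
}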
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 *A,
    const phi::dtype::bfloat16 *B, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 *C,
    int batchCount, int64_t strideA, int64_t strideB) const {
#if CUDA_VERSION >= 11000
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const int64_t strideC = M * N;

  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);

  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
  bool use_tensor_op_math = context_.tensor_core_available();
  if (use_tensor_op_math) {
    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
  }
  VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");

  context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmStridedBatchedEx(
        handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, strideB,
        A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, strideC, batchCount,
        CUBLAS_COMPUTE_32F, algo));
  });
#else
  // raise error
  PADDLE_THROW(phi::errors::Unimplemented(
      "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= "
      "11"));
#endif  // CUDA_VERSION >= 11000
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -2582,26 +1593,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif  // CUDA_VERSION >= 11000
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
    const T **A, const T **B, T beta, T **C, int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -2621,26 +1612,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::float16 alpha, const phi::dtype::float16 **A,
    const phi::dtype::float16 **B, phi::dtype::float16 beta, phi::dtype::float16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::float16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -2660,26 +1631,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 **A,
    const phi::dtype::bfloat16 **B, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::bfloat16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -2699,37 +1650,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
                                                     CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
                                                     int M, int N, T alpha, const T *A, int lda,
                                                     T *B, int ldb) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  cublasSideMode_t cuSide = (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t cuUplo = (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasDiagType_t cuDiag = (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
...
...
@@ -2761,15 +1681,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
  });
}
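Side note: TRSM keeps row-major call sites by flipping `side` and `uplo` before calling the column-major routine, per the comment above. A minimal sketch of how a kernel might call the unified wrapper after this change; GetBlas, the device buffers and the context are assumed to be set up elsewhere, so treat this as an illustration rather than the exact API:

#include "paddle/phi/kernels/funcs/blas/blas.h"

// Sketch only: solve A * X = B for X, with A an M x M lower-triangular matrix
// and B an M x N right-hand side, both row-major device buffers; the result
// overwrites d_B. The wrapper flips side/uplo internally, as shown above.
void SolveLowerTriangular(const phi::GPUContext& dev_ctx, const float* d_A,
                          float* d_B, int M, int N) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, float>(dev_ctx);
  blas.TRSM(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
            M, N, 1.0f, d_A, /*lda=*/M, d_B, /*ldb=*/N);
}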
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
    int n, T **a, int *ipiv, int *info, int batch_size) const {
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
...
...
@@ -2779,25 +1690,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
    int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size) const {
  PADDLE_ENFORCE_NE(
      a_inv,
      a,
      phi::errors::InvalidArgument(
          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
          "in-place. The memory space of output matrix (address: %p) cannot "
          "overlap memory space of input matrix (address: %p).",
          a_inv,
          a));
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
...
...
@@ -2820,15 +1712,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
  });
}
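Side note: the PADDLE_ENFORCE_NE above forbids in-place batched inversion, so callers need a separate output pointer array. A rough sketch of the factorize-then-invert flow on the unified context (the device-side pointer arrays, ipiv/info buffers and the GetBlas helper are assumed and not shown in this diff):

#include "paddle/phi/kernels/funcs/blas/blas.h"

// Sketch only: invert `batch` n x n matrices. d_a and d_a_inv are device
// arrays of device matrix pointers, d_ipiv/d_info are device buffers; all of
// that setup is omitted here. Note the separate d_a_inv output buffer,
// required by the in-place check above.
void BatchedInvert(const phi::GPUContext& dev_ctx, float** d_a, float** d_a_inv,
                   int* d_ipiv, int* d_info, int n, int batch) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, float>(dev_ctx);
  blas.BatchedGETRF(n, d_a, d_ipiv, d_info, batch);             // LU factorization
  blas.BatchedGETRI(n, const_cast<const float**>(d_a), d_ipiv,  // inverse from the LU factors
                    d_a_inv, d_info, batch);
}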
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
    int n, const T **a, T **a_inv, int *info, int batch_size) const {
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
...
...
@@ -2838,28 +1721,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
    CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv, T **b,
    int ldb, int *info, int batch_size) const {
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
...
...
@@ -2881,50 +1742,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
    CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
    int M, int N, T alpha, const T **A, int lda, T **B, int ldb, int batch_size) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  cublasSideMode_t cuSide = (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t cuUplo = (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasDiagType_t cuDiag = (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

  context_.CublasCall([&](cublasHandle_t handle) {
    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha,
                          A, lda, B, ldb, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
...
...
paddle/phi/kernels/funcs/blas/blas_impl.hip.h
View file @ 88490567
...
...
@@ -257,54 +257,6 @@ struct CUBlas<phi::dtype::float16> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb, int m, int n, int k, const void *alpha,
                    const void *A, rocblas_datatype Atype, int lda, const void *B,
                    rocblas_datatype Btype, int ldb, const void *beta, void *C,
                    rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
  dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
        handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta,
        C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
  });
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb,
...
...
@@ -474,54 +426,6 @@ struct CUBlas<phi::dtype::complex<float>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb, int m, int n, int k, const void *alpha,
                    const void *A, rocblas_datatype Atype, int lda, const void *B,
                    rocblas_datatype Btype, int ldb, const void *beta, void *C,
                    rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
  dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
        handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta,
        C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
  });
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb,
...
...
@@ -692,54 +596,6 @@ struct CUBlas<phi::dtype::complex<double>> {
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template <typename... ARGS>
static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb, int m, int n, int k, const void *alpha,
                    const void *A, rocblas_datatype Atype, int lda, const void *B,
                    rocblas_datatype Btype, int ldb, const void *beta, void *C,
                    rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
  dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
        handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta,
        C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
  });
}
template <typename... ARGS>
static void GEMM_EX(phi::GPUContext *dev_ctx, rocblas_operation transa,
                    rocblas_operation transb,
...
...
@@ -789,45 +645,6 @@ struct CUBlas<phi::dtype::complex<double>> {
}
};
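Side note: in the CUBlas helper structs above only the type of the first parameter changes (fluid CUDADeviceContext* becomes phi::GPUContext*); the rocblas_gemm_ex call itself is untouched. A hedged sketch of an end-user call on the unified context, using only the GEMM signature visible in this diff (the GetBlas helper and the device buffers are assumptions, not shown here):

#include "paddle/phi/kernels/funcs/blas/blas.h"

// Sketch only: row-major C(MxN) = A(MxK) * B(KxN) in float16 through the
// unified context; works on both CUDA and ROCm builds after this change.
void HalfGemm(const phi::GPUContext& dev_ctx, const phi::dtype::float16* d_A,
              const phi::dtype::float16* d_B, phi::dtype::float16* d_C,
              int M, int N, int K) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, phi::dtype::float16>(dev_ctx);
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K,
            static_cast<phi::dtype::float16>(1.0f), d_A, d_B,
            static_cast<phi::dtype::float16>(0.0f), d_C);
}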
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB,
                                                     int M, int N, int K, T alpha, const T *A,
                                                     const T *B, T beta, T *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -868,62 +685,6 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
});
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                                            CBLAS_TRANSPOSE transB,
                                                            int M, int N, int K,
                                                            phi::dtype::float16 alpha,
                                                            const phi::dtype::float16 *A,
                                                            const phi::dtype::float16 *B,
                                                            phi::dtype::float16 beta,
                                                            phi::dtype::float16 *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(),
      53,
      phi::errors::InvalidArgument(
          "cublas fp16 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);

  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
  CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha,
                                       B, rocblas_datatype_f16_r, ldb, A,
                                       rocblas_datatype_f16_r, lda, &h_beta, C,
                                       rocblas_datatype_f16_r, N, rocblas_datatype_f32_r);
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -982,17 +743,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                                            CBLAS_TRANSPOSE transB,
                                                            int M, int N, int K,
                                                            phi::dtype::bfloat16 alpha,
                                                            const phi::dtype::bfloat16 *A,
                                                            const phi::dtype::bfloat16 *B,
                                                            phi::dtype::bfloat16 beta,
                                                            phi::dtype::bfloat16 *C) const {
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                        CBLAS_TRANSPOSE transB,
                                        int M, int N, int K,
                                        phi::dtype::bfloat16 alpha,
                                        const phi::dtype::bfloat16 *A,
                                        const phi::dtype::bfloat16 *B,
                                        phi::dtype::bfloat16 beta,
                                        phi::dtype::bfloat16 *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
...
...
@@ -1052,11 +812,11 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                        int M, int N, int K,
                                        phi::dtype::bfloat16 alpha,
                                        const phi::dtype::bfloat16 *A,
                                        const phi::dtype::bfloat16 *B,
                                        phi::dtype::bfloat16 beta,
                                        phi::dtype::bfloat16 *C) const {
                                        phi::dtype::complex<float> alpha,
                                        const phi::dtype::complex<float> *A,
                                        const phi::dtype::complex<float> *B,
                                        phi::dtype::complex<float> beta,
                                        phi::dtype::complex<float> *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
...
...
@@ -1067,140 +827,19 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  // TODO(zhiqiu): 80 has the same meaning for rocm and cuda?
  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(),
      80,
      53,
      phi::errors::InvalidArgument(
          "rocblas fp16 gemm requires GPU compute capability >= 80,"
          "cublas complex64 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;

  context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
        handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb,
        A, rocblas_datatype_bf16_r, lda, &h_beta, C, rocblas_datatype_bf16_r, N,
        C, rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo, 0, 0));
  });
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                                            CBLAS_TRANSPOSE transB,
                                                            int M, int N, int K,
                                                            phi::dtype::complex<float> alpha,
                                                            const phi::dtype::complex<float> *A,
                                                            const phi::dtype::complex<float> *B,
                                                            phi::dtype::complex<float> beta,
                                                            phi::dtype::complex<float> *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(),
      53,
      phi::errors::InvalidArgument(
          "cublas complex64 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);

  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
  CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha,
                                              B, rocblas_datatype_f32_c, ldb, A,
                                              rocblas_datatype_f32_c, lda, &c_beta, C,
                                              rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                        CBLAS_TRANSPOSE transB,
                                        int M, int N, int K,
                                        phi::dtype::complex<float> alpha,
                                        const phi::dtype::complex<float> *A,
                                        const phi::dtype::complex<float> *B,
                                        phi::dtype::complex<float> beta,
                                        phi::dtype::complex<float> *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(),
      53,
      phi::errors::InvalidArgument(
          "cublas complex64 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);

  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
  CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
...
...
@@ -1223,64 +862,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                              rocblas_datatype_f32_c);
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                                            CBLAS_TRANSPOSE transB,
                                                            int M, int N, int K,
                                                            phi::dtype::complex<double> alpha,
                                                            const phi::dtype::complex<double> *A,
                                                            const phi::dtype::complex<double> *B,
                                                            phi::dtype::complex<double> beta,
                                                            phi::dtype::complex<double> *C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(
      context_.GetComputeCapability(),
      53,
      phi::errors::InvalidArgument(
          "cublas complex128 gemm requires GPU compute capability >= 53,"
          "but received %d",
          context_.GetComputeCapability()));

  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);

  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
  CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha,
                                               B, rocblas_datatype_f64_c, ldb, A,
                                               rocblas_datatype_f64_c, lda, &c_beta, C,
                                               rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -1339,44 +920,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                               rocblas_datatype_f64_c);
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M, int N, int K,
                                                     T alpha, const T *A, int lda, const T *B,
                                                     int ldb, T beta, T *C, int ldc) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  rocblas_operation cuTransA = transA ? rocblas_operation_transpose : rocblas_operation_none;
  rocblas_operation cuTransB = transB ? rocblas_operation_transpose : rocblas_operation_none;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::GEMM(bool transA,
...
...
@@ -1416,46 +959,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
  });
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA, bool transB,
                                                            int M, int N, int K,
                                                            phi::dtype::float16 alpha,
                                                            const phi::dtype::float16 *A, int lda,
                                                            const phi::dtype::float16 *B, int ldb,
                                                            phi::dtype::float16 beta,
                                                            phi::dtype::float16 *C, int ldc) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  rocblas_operation cuTransA = transA ? rocblas_operation_transpose : rocblas_operation_none;
  rocblas_operation cuTransB = transB ? rocblas_operation_transpose : rocblas_operation_none;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<phi::dtype::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
                                      A, lda, &beta, C, ldc);
  });
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
...
...
@@ -1496,16 +999,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x, T *y) const {
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
...
...
@@ -1514,14 +1007,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
  context_.CublasCall(
      [&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
...
...
@@ -1529,14 +1014,6 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
      [&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
  context_.CublasCall(
      [&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
...
...
@@ -1544,23 +1021,6 @@ void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
      [&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
                                                     const T *A, const T *B, T beta, T *C) const {
  rocblas_operation cuTransA = !trans_a ? rocblas_operation_transpose : rocblas_operation_none;

  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
...
@@ -1579,26 +1039,6 @@ void Blas<phi::GPUContext>::GEMV(bool trans_a,
  });
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
                                                            phi::dtype::float16 alpha,
                                                            const phi::dtype::float16 *A,
                                                            const phi::dtype::float16 *B,
                                                            phi::dtype::float16 beta,
                                                            phi::dtype::float16 *C) const {
  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
  if (trans_a) {
    this->template GEMM<phi::dtype::float16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::float16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
...
@@ -1619,26 +1059,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
  }
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
                                                            phi::dtype::bfloat16 alpha,
                                                            const phi::dtype::bfloat16 *A,
                                                            const phi::dtype::bfloat16 *B,
                                                            phi::dtype::bfloat16 beta,
                                                            phi::dtype::bfloat16 *C) const {
  // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it.
  if (trans_a) {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
  } else {
    this->template GEMM<phi::dtype::bfloat16>(
        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
...
@@ -1659,56 +1079,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
  }
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
    const T *A, const T *B, T beta, T *C, int batchCount, int64_t strideA,
    int64_t strideB) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  const int64_t strideC = M * N;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
                                  strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -1758,71 +1128,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  });
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 *A,
    const phi::dtype::bfloat16 *B, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 *C,
    int batchCount, int64_t strideA, int64_t strideB) const {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  const int64_t strideC = M * N;
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_operation cuTransB =
      (transB == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;

  context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_strided_batched_ex(
        handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb, strideB,
        A, rocblas_datatype_bf16_r, lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc,
        strideC, C, rocblas_datatype_bf16_r, ldc, strideC, batchCount, rocblas_datatype_f32_r,
        algo, 0, 0));
  });
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -1887,26 +1192,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
    const T **A, const T **B, T beta, T **C, int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -1926,25 +1211,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::float16 alpha, const phi::dtype::float16 **A,
    const phi::dtype::float16 **B, phi::dtype::float16 beta, phi::dtype::float16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::float16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -1964,26 +1230,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 **A,
    const phi::dtype::bfloat16 **B, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::bfloat16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
...
@@ -2003,37 +1249,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
                                                     CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
                                                     int M, int N, T alpha, const T *A, int lda,
                                                     T *B, int ldb) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  rocblas_side cuSide = (side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
  rocblas_fill cuUplo = (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_diagonal cuDiag = (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;

  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
...
...
@@ -2066,14 +1281,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
    int n, T **a, int *ipiv, int *info, int batch_size) const {
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
...
...
@@ -2083,24 +1290,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
    int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size) const {
  PADDLE_ENFORCE_NE(
      a_inv,
      a,
      phi::errors::InvalidArgument(
          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
          "in-place. The memory space of output matrix (address: %p) cannot "
          "overlap memory space of input matrix (address: %p).",
          a_inv,
          a));
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
...
...
@@ -2123,14 +1312,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
    int n, const T **a, T **a_inv, int *info, int batch_size) const {
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
...
...
@@ -2140,27 +1321,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
    CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv, T **b,
    int ldb, int *info, int batch_size) const {
  rocblas_operation cuTrans =
      (trans == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
...
...
@@ -2182,50 +1342,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
  });
}

template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
    CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
    int M, int N, T alpha, const T **A, int lda, T **B, int ldb, int batch_size) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  rocblas_side cuSide = (side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
  rocblas_fill cuUplo = (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  rocblas_operation cuTransA =
      (transA == CblasNoTrans) ? rocblas_operation_none : rocblas_operation_transpose;
  rocblas_diagonal cuDiag = (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;

  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha,
                          A, lda, B, ldb, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
...
...
paddle/phi/kernels/funcs/fc_functor.cu
View file @ 88490567
...
...
@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
  AddReluKernel(context.stream(), M, N, Y, B, relu);
}

template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
...
...
paddle/phi/kernels/funcs/for_range.h
View file @ 88490567
...
...
@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
  size_t limit_;
};

// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CUDADeviceContext> {
  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
      : dev_ctx_(dev_ctx), limit_(limit) {}

  template <typename Function>
  inline void operator()(Function func) const {
    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
    for_range(func);
  }

  const paddle::platform::CUDADeviceContext& dev_ctx_;
  size_t limit_;
};
#endif

}  // namespace funcs
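Side note: with the fluid ForRange specialization deleted, element-wise loops use phi::funcs::ForRange<phi::GPUContext> directly. A minimal sketch of the calling pattern (HOSTDEVICE comes from phi's hostdevice.h; the context and the device buffers are assumed to be prepared elsewhere):

#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/for_range.h"

// Simple per-element functor; operator() is invoked once per index in [0, n).
struct ScaleFunctor {
  const float* in_;
  float* out_;
  float scale_;
  HOSTDEVICE void operator()(size_t i) const { out_[i] = scale_ * in_[i]; }
};

void ScaleOnGpu(const phi::GPUContext& dev_ctx, const float* d_in, float* d_out,
                size_t n, float scale) {
  phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx, n);
  for_range(ScaleFunctor{d_in, d_out, scale});
}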
...
...
paddle/phi/kernels/funcs/math_function.cu
View file @ 88490567
...
...
@@ -31,22 +31,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;

template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::complex<double>>;
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
...
...
@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                            phi::dtype::complex<double>>;
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
double, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
float16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int8_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int32_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
DEFINE_GPU_TRANS(1);
...
...
@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
};
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::GPUContext, TYPE>
DEFINE_GPU_TRANS_NORMAL(float16);
...
...
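Side note: SetConstant and Transpose are now instantiated only for phi::GPUContext (plus CUDAPinnedDeviceContext for SetConstant). A small sketch of the typical SetConstant call, assuming an already-allocated DenseTensor and a phi::GPUContext; the (context, tensor, value) signature reflects my reading of math_function.h, so double-check before copying:

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

// Sketch only: zero-fill a DenseTensor on the GPU through the phi context.
void ZeroOut(const phi::GPUContext& dev_ctx, phi::DenseTensor* out) {
  phi::funcs::SetConstant<phi::GPUContext, float> set_zero;
  set_zero(dev_ctx, out, 0.0f);
}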
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
View file @ 88490567
...
...
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;

// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, double>;

}  // namespace funcs
}  // namespace phi
paddle/phi/kernels/funcs/matrix_solve.cu
View file @ 88490567
...
...
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;

// TODO(wuweilong): remove these instantiations later
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;

}  // namespace funcs
}  // namespace phi
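Side note: the fluid instantiations of MatrixInverseFunctor and MatrixSolveFunctor are kept only temporarily (see the TODOs above). A hedged sketch of the phi-context usage, assuming the functor exposes a (dev_ctx, input, output) call operator; this is not taken from the diff, so verify the header before relying on it:

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/matrix_inverse.h"

// Sketch only: invert a batch of square matrices stored in `a` (shape
// [..., n, n]); `a_inv` must be pre-allocated with the same shape on the
// same device as dev_ctx.
void InvertOnGpu(const phi::GPUContext& dev_ctx, const phi::DenseTensor& a,
                 phi::DenseTensor* a_inv) {
  phi::funcs::MatrixInverseFunctor<phi::GPUContext, float> inverse;
  inverse(dev_ctx, a, a_inv);
}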