Commit 88490567 (unverified)
Authored Jul 29, 2022 by Leo Chen; committed via GitHub on Jul 29, 2022
unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove cudaDeviceContext
* remove more template
* fix rocm compile
Parent: 0a2db7c8
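The heart of the change is in paddle/fluid/platform/device_context.h (see its hunk below): the fluid-side class is deleted and the old name becomes a type alias, so code that still spells platform::CUDADeviceContext now resolves directly to phi::GPUContext. A simplified before/after sketch of that one file (the real removed class also carried a placeholder member and a DISABLE_COPY_AND_ASSIGN macro):

// Before: fluid wrapped the phi context in a thin subclass.
class CUDADeviceContext : public phi::GPUContext {
 public:
  explicit CUDADeviceContext(CUDAPlace place);
  virtual ~CUDADeviceContext();
};

// After: the subclass is gone; the old name is just an alias.
using CUDADeviceContext = phi::GPUContext;

Once the two names denote the same type, the per-file forward declarations and the duplicated template instantiations for platform::CUDADeviceContext in the rest of the diff become redundant and can be dropped.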
Showing 26 changed files with 122 additions and 2801 deletions (+122 −2801)
paddle/fluid/framework/details/eager_deletion_op_handle.h        +0   −6
paddle/fluid/memory/allocation/cuda_device_context_allocator.h   +0   −5
paddle/fluid/operators/cudnn_lstm_op.cu.cc                       +0   −7
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu             +4   −3
paddle/fluid/operators/gru_op.cu.cc                              +0   −7
paddle/fluid/operators/math/cross_entropy.cu                     +0   −5
paddle/fluid/operators/math/im2col.cu                            +0   −24
paddle/fluid/operators/math/maxouting.cu                         +0   −6
paddle/fluid/operators/math/sample_prob.h                        +0   −6
paddle/fluid/operators/math/selected_rows_functor.cu             +2   −159
paddle/fluid/operators/math/sequence_padding.cu                  +0   −159
paddle/fluid/operators/math/sequence_scale.cu                    +0   −40
paddle/fluid/operators/math/softmax.cu                           +0   −35
paddle/fluid/operators/math/vol2col.cu                           +0   −4
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc     +0   −6
paddle/fluid/platform/collective_helper.h                        +0   −1
paddle/fluid/platform/device_context.cc                          +0   −5
paddle/fluid/platform/device_context.h                           +1   −9
paddle/fluid/platform/transform.h                                +0   −60
paddle/phi/kernels/funcs/blas/blas_impl.cu.h                     +80  −1263
paddle/phi/kernels/funcs/blas/blas_impl.hip.h                    +22  −906
paddle/phi/kernels/funcs/fc_functor.cu                           +0   −4
paddle/phi/kernels/funcs/for_range.h                             +0   −16
paddle/phi/kernels/funcs/math_function.cu                        +13  −56
paddle/phi/kernels/funcs/matrix_inverse.cu.cc                    +0   −5
paddle/phi/kernels/funcs/matrix_solve.cu                         +0   −4
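Most of the files listed above change in the same mechanical way: forward declarations of platform::CUDADeviceContext disappear, and of each pair of duplicate template instantiations only the phi::GPUContext one survives. A representative before/after pair, taken from the im2col.cu hunk further down:

// Removed: the instantiation keyed on the old fluid context type.
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                             platform::CUDADeviceContext, float>;

// Kept: the phi::GPUContext instantiation, which the new alias also covers.
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                             phi::GPUContext, float>;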
paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace framework {
 class GarbageCollector;
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
 namespace memory {
 namespace allocation {
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                              config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
     // not need show click input
     N = static_cast<size_t>(batch_size * slot_num *
                             (embedding_size - cvm_offset));
-    platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(dev_ctx, N);
     FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                           config.thread_per_block.x,
                           0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
paddle/fluid/operators/gru_op.cu.cc
@@ -14,13 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
paddle/fluid/operators/math/cross_entropy.cu
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
   }
 }
 
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
paddle/fluid/operators/math/im2col.cu
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
paddle/fluid/operators/math/maxouting.cu
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
                     axis);
 }
 
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/sample_prob.h
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 namespace math {
paddle/fluid/operators/math/selected_rows_functor.cu
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
   }
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height,
-            in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height,
-            out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel,
-            output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            out_data,
-            in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext,
-                                platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
   }
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height,
-                                          in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            in2_data,
-            in1_row_numel);
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
   }
 };
 
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, output, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, inputs, output, sorted_result);
-  }
-};
-
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
   // unary functor, merge by adding duplicated rows in
@@ -679,9 +524,7 @@ struct MergeAdd<phi::GPUContext, T> {
 };
 
 #define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                    \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
   template struct MergeAddImpl<phi::GPUContext, dtype>;             \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;     \
   template struct MergeAdd<phi::GPUContext, dtype>;
 
 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
paddle/fluid/operators/math/sequence_padding.cu
@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
   }
 }
 
-template <typename T>
-class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    PADDLE_ENFORCE_GE(
-        pad_seq_len,
-        max_seq_len,
-        platform::errors::InvalidArgument(
-            "The pad_seq_len must be equal to or greater than the "
-            "original max sequence length. Expected %ld >= %ld, but got %ld < "
-            "%ld. Please check the input value.",
-            pad_seq_len,
-            max_seq_len,
-            pad_seq_len,
-            max_seq_len));
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    PADDLE_ENFORCE_EQ(
-        pad_value.numel() == 1 || pad_value.numel() == step_width,
-        true,
-        platform::errors::InvalidArgument(
-            "The numel of 'pad_value' can only be 1 or be equal to "
-            "the 'step_width', but got %ld != 1 and %ld. Please check the "
-            "input value.",
-            pad_value.numel(),
-            step_width));
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* seq_data = seq_tensor.data<T>();
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-
-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
-        pad_data,
-        seq_data,
-        pad_value_data,
-        pad_value.numel() == 1,
-        mix_vector_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class PaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    /*
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
-                                    seq_tensor);
-      seq_tensor->Resize(seq_tensor_dims);
-      return;
-    }
-    */
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* pad_data = pad_tensor.data<T>();
-    T* seq_data = seq_tensor->data<T>();
-
-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
-        seq_data,
-        pad_data,
-        nullptr,
-        false,
-        mixv_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
paddle/fluid/operators/math/sequence_scale.cu
@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
   }
 }
 
-template <typename T>
-class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq->numel() / seq->dims()[0];
-    auto abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
-
-#ifdef PADDLE_WITH_HIP
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
-        dim3(num_seq),
-        dim3(PADDLE_CUDA_NUM_THREADS),
-        0,
-        context.stream(),
-        seq_data,
-        mix_vector.CUDAMutableData(context.GetPlace()),
-        scales,
-        seq_width);
-#else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
-        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-            seq_data,
-            mix_vector.CUDAMutableData(context.GetPlace()),
-            scales,
-            seq_width);
-#endif
-    mix_vector.CopyToCPU();
-  }
-};
-
 template <typename T>
 class ScaleLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
 template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/softmax.cu
@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
 #endif
 }
 
-template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxCUDNNFunctor<platform::float16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::float16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
 #if CUDNN_VERSION_MIN(8, 1, 0)
-template class SoftmaxCUDNNFunctor<platform::bfloat16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 #endif
 
 // MIOPEN do not support double
 #ifndef PADDLE_WITH_HIP
-template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
 #endif
 
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::bfloat16>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
 template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
paddle/fluid/operators/math/vol2col.cu
@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
 }
 // };
 
-template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
-template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
 template class Vol2ColFunctor<phi::GPUContext, float>;
 template class Vol2ColFunctor<phi::GPUContext, double>;
 
-template class Col2VolFunctor<platform::CUDADeviceContext, float>;
-template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 template class Col2VolFunctor<phi::GPUContext, float>;
 template class Col2VolFunctor<phi::GPUContext, double>;
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -16,12 +16,6 @@
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 REGISTER_OP_CUDA_KERNEL(
     sequence_concat,
     paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/platform/collective_helper.h
@@ -51,7 +51,6 @@ namespace platform {
 //
 // The NCCLComm instance is created and reversed in the NCCLCommContext
 // singleton with a global user specified group id.
-class CUDADeviceContext;
 class NCCLComm {
  public:
paddle/fluid/platform/device_context.cc
@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
   allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }
 
-CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
-    : phi::GPUContext(place) {}
-
-CUDADeviceContext::~CUDADeviceContext() = default;
-
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
paddle/fluid/platform/device_context.h
@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;
 
-class CUDADeviceContext : public phi::GPUContext {
- public:
-  explicit CUDADeviceContext(CUDAPlace place);
-  virtual ~CUDADeviceContext();
-
- private:
-  int place_holder_;  // TO BE REMOVED
-  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
-};
+using CUDADeviceContext = phi::GPUContext;
 
 class CudnnWorkspaceHandle {
  public:
paddle/fluid/platform/transform.h
@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
 };
 
 #if defined(__NVCC__) || defined(__HIPCC__)
-template <>
-struct Transform<platform::CUDADeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter first,
-                  InputIter last,
-                  OutputIter result,
-                  UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-
-  template <typename InputIter1,
-            typename InputIter2,
-            typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter1 first1,
-                  InputIter1 last1,
-                  InputIter2 first2,
-                  OutputIter result,
-                  BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-};
-
 template <>
 struct Transform<phi::GPUContext> {
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
浏览文件 @
88490567
...
@@ -66,58 +66,6 @@ struct CUBlas<float> {
...
@@ -66,58 +66,6 @@ struct CUBlas<float> {
#endif
#endif
}
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
paddle
::
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
float
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
float
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
)
{
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
dev_ctx
->
tensor_core_available
()
?
"True"
:
"False"
);
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
paddle
::
platform
::
dynload
::
cublasSgemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
));
});
#else
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"cublasSgemmEx is not supported on cuda <= 7.5"
));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
...
@@ -366,66 +314,6 @@ struct CUBlas<phi::dtype::float16> {
...
@@ -366,66 +314,6 @@ struct CUBlas<phi::dtype::float16> {
#endif
#endif
}
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
paddle
::
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
dev_ctx
->
tensor_core_available
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
paddle
::
platform
::
dynload
::
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
});
#else
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"cublasGemmEx is not supported on cuda <= 7.5"
));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
...
@@ -636,66 +524,6 @@ struct CUBlas<phi::dtype::complex<float>> {
...
@@ -636,66 +524,6 @@ struct CUBlas<phi::dtype::complex<float>> {
ldb
));
ldb
));
}
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
paddle
::
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
dev_ctx
->
tensor_core_available
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
paddle
::
platform
::
dynload
::
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
});
#else
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"cublasGemmEx is not supported on cuda <= 7.5"
));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
...
@@ -965,66 +793,6 @@ struct CUBlas<phi::dtype::complex<double>> {
...
@@ -965,66 +793,6 @@ struct CUBlas<phi::dtype::complex<double>> {
batch_size
));
batch_size
));
}
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
static
void
GEMM_EX
(
paddle
::
platform
::
CUDADeviceContext
*
dev_ctx
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
void
*
alpha
,
const
void
*
A
,
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
dev_ctx
->
tensor_core_available
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
paddle
::
platform
::
dynload
::
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
});
#else
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"cublasGemmEx is not supported on cuda <= 7.5"
));
#endif
}
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
// https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
...
@@ -1088,7 +856,7 @@ struct CUBlas<phi::dtype::complex<double>> {
...
@@ -1088,7 +856,7 @@ struct CUBlas<phi::dtype::complex<double>> {
template
<
>
template
<
>
template
<
typename
T
>
template
<
typename
T
>
void
Blas
<
p
addle
::
platform
::
CUDADevice
Context
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
void
Blas
<
p
hi
::
GPU
Context
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
M
,
int
N
,
int
N
,
...
@@ -1109,8 +877,7 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1109,8 +877,7 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
#if CUDA_VERSION >= 8000
#if CUDA_VERSION >= 8000
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
&
cuda_ctx
=
auto
&
cuda_ctx
=
const_cast
<
phi
::
GPUContext
&>
(
context_
);
const_cast
<
paddle
::
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransB
,
cuTransA
,
cuTransA
,
...
@@ -1153,17 +920,17 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1153,17 +920,17 @@ void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
}
}
template
<
>
template
<
>
template
<
typename
T
>
template
<
>
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
inline
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
M
,
int
N
,
int
N
,
int
K
,
int
K
,
T
alpha
,
phi
::
dtype
::
float16
alpha
,
const
T
*
A
,
const
phi
::
dtype
::
float16
*
A
,
const
T
*
B
,
const
phi
::
dtype
::
float16
*
B
,
T
beta
,
phi
::
dtype
::
float16
beta
,
T
*
C
)
const
{
phi
::
dtype
::
float16
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
...
@@ -1172,163 +939,18 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1172,163 +939,18 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
#if CUDA_VERSION >= 8000
// TODO(kexinzhao): add processing code for compute capability < 53 case
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
PADDLE_ENFORCE_GE
(
auto
&
cuda_ctx
=
const_cast
<
phi
::
GPUContext
&>
(
context_
);
context_
.
GetComputeCapability
(),
CUBlas
<
T
>::
GEMM_EX
(
&
cuda_ctx
,
53
,
cuTransB
,
phi
::
errors
::
InvalidArgument
(
cuTransA
,
"cublas fp16 gemm requires GPU compute capability >= 53,"
N
,
"but received %d"
,
M
,
context_
.
GetComputeCapability
()));
K
,
&
alpha
,
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
B
,
float
h_beta
=
static_cast
<
float
>
(
beta
);
CUDA_R_32F
,
ldb
,
A
,
CUDA_R_32F
,
lda
,
&
beta
,
C
,
CUDA_R_32F
,
N
);
}
else
{
#endif // CUDA_VERSION >= 8000
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
});
#if CUDA_VERSION >= 8000
}
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
phi
::
dtype
::
float16
alpha
,
const
phi
::
dtype
::
float16
*
A
,
const
phi
::
dtype
::
float16
*
B
,
phi
::
dtype
::
float16
beta
,
phi
::
dtype
::
float16
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
phi
::
errors
::
InvalidArgument
(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
auto
&
cuda_ctx
=
const_cast
<
paddle
::
platform
::
CUDADeviceContext
&>
(
context_
);
CUBlas
<
phi
::
dtype
::
float16
>::
GEMM_EX
(
&
cuda_ctx
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUDA_R_16F
,
ldb
,
A
,
CUDA_R_16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
CUDA_R_32F
);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
phi
::
dtype
::
float16
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
h_beta
,
h_C
,
N
);
});
#endif // CUDA_VERSION >= 8000
}
template
<
>
template
<
>
inline
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
phi
::
dtype
::
float16
alpha
,
const
phi
::
dtype
::
float16
*
A
,
const
phi
::
dtype
::
float16
*
B
,
phi
::
dtype
::
float16
beta
,
phi
::
dtype
::
float16
*
C
)
const
{
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
53
,
phi
::
errors
::
InvalidArgument
(
"cublas fp16 gemm requires GPU compute capability >= 53,"
"but received %d"
,
context_
.
GetComputeCapability
()));
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
#if CUDA_VERSION >= 8000
#if CUDA_VERSION >= 8000
// cublasHgemm does true FP16 computation which is slow for non-Volta
// cublasHgemm does true FP16 computation which is slow for non-Volta
...
@@ -1376,77 +998,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1376,77 +998,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
#endif // CUDA_VERSION >= 8000
#endif // CUDA_VERSION >= 8000
}
}
template
<
>
template
<
>
inline
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
phi
::
dtype
::
bfloat16
alpha
,
const
phi
::
dtype
::
bfloat16
*
A
,
const
phi
::
dtype
::
bfloat16
*
B
,
phi
::
dtype
::
bfloat16
beta
,
phi
::
dtype
::
bfloat16
*
C
)
const
{
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int
lda
=
(
transA
==
CblasNoTrans
)
?
K
:
M
;
int
ldb
=
(
transB
==
CblasNoTrans
)
?
N
:
K
;
cublasOperation_t
cuTransA
=
(
transA
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
cublasOperation_t
cuTransB
=
(
transB
==
CblasNoTrans
)
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE
(
context_
.
GetComputeCapability
(),
80
,
phi
::
errors
::
InvalidArgument
(
"cublas fp16 gemm requires GPU compute capability >= 80,"
"but received %d"
,
context_
.
GetComputeCapability
()));
float
h_alpha
=
static_cast
<
float
>
(
alpha
);
float
h_beta
=
static_cast
<
float
>
(
beta
);
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DEFAULT
;
bool
use_tensor_op_math
=
context_
.
tensor_core_available
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
context_
.
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
paddle
::
platform
::
dynload
::
cublasGemmEx
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
B
,
CUDA_R_16BF
,
ldb
,
A
,
CUDA_R_16BF
,
lda
,
&
h_beta
,
C
,
CUDA_R_16BF
,
N
,
CUDA_R_32F
,
algo
));
});
#else
// raise error
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"cublasGemmEx with bfloat16 is not supported on cuda <= 11"
));
#endif // CUDA_VERSION >= 11000
}
template
<
>
template
<
>
template
<
>
template
<
>
inline
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
inline
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
...
@@ -1519,8 +1070,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1519,8 +1070,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
template
<
>
template
<
>
template
<
>
template
<
>
inline
void
Blas
<
paddle
::
platform
::
CUDADeviceContext
>::
GEMM
(
inline
void
Blas
<
phi
::
GPUContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
M
,
int
N
,
int
N
,
...
@@ -1557,7 +1107,7 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
   // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
   // input/output in fp16, computation in fp32, which can also be accelerated
   // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
+  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
   CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
                                               cuTransB,
                                               cuTransA,
...
@@ -1605,11 +1155,11 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                         int M,
                                         int N,
                                         int K,
-                                        phi::dtype::complex<float> alpha,
+                                        phi::dtype::complex<double> alpha,
-                                        const phi::dtype::complex<float> *A,
+                                        const phi::dtype::complex<double> *A,
-                                        const phi::dtype::complex<float> *B,
+                                        const phi::dtype::complex<double> *B,
-                                        phi::dtype::complex<float> beta,
+                                        phi::dtype::complex<double> beta,
-                                        phi::dtype::complex<float> *C) const {
+                                        phi::dtype::complex<double> *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
...
@@ -1624,13 +1174,14 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                     context_.GetComputeCapability(),
                     53,
                     phi::errors::InvalidArgument(
-                        "cublas complex64 gemm requires GPU compute capability >= 53,"
+                        "cublas complex128 gemm requires GPU compute capability >= 53,"
                         "but received %d",
                         context_.GetComputeCapability()));
-  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
+  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
-  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
+  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);
 #if CUDA_VERSION >= 8000
   // cublasHgemm does true FP16 computation which is slow for non-Volta
...
@@ -1638,7 +1189,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
   // input/output in fp16, computation in fp32, which can also be accelerated
   // using tensor cores in volta GPUs.
   auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
-  CUBlas<phi::dtype::complex<float>>::GEMM_EX(&cuda_ctx,
+  CUBlas<phi::dtype::complex<double>>::GEMM_EX(&cuda_ctx,
                                               cuTransB,
                                               cuTransA,
                                               N,
...
@@ -1646,98 +1197,16 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                               K,
                                               &c_alpha,
                                               B,
-                                              CUDA_C_32F,
+                                              CUDA_C_64F,
                                               ldb,
                                               A,
-                                              CUDA_C_32F,
+                                              CUDA_C_64F,
                                               lda,
                                               &c_beta,
                                               C,
-                                              CUDA_C_32F,
+                                              CUDA_C_64F,
                                               N,
-                                              CUDA_C_32F);
+                                              CUDA_C_64F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::complex<float>>::GEMM(
-        handle, cuTransB, cuTransA, N, M, K, &c_alpha, h_B, ldb, h_A, lda, &c_beta, h_C, N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::complex<double> alpha, const phi::dtype::complex<double> *A,
-    const phi::dtype::complex<double> *B, phi::dtype::complex<double> beta,
-    phi::dtype::complex<double> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas complex128 gemm requires GPU compute capability >= 53,"
-                        "but received %d",
-                        context_.GetComputeCapability()));
-  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
-  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::complex<double>>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb,
-      A, CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
 #else
   // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
...
@@ -1760,153 +1229,6 @@ inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
 #endif  // CUDA_VERSION >= 8000
 }
-template <>
-template <>
-inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                        CBLAS_TRANSPOSE transB,
-                                        int M, int N, int K,
-                                        phi::dtype::complex<double> alpha,
-                                        const phi::dtype::complex<double> *A,
-                                        const phi::dtype::complex<double> *B,
-                                        phi::dtype::complex<double> beta,
-                                        phi::dtype::complex<double> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas complex128 gemm requires GPU compute capability >= 53,"
-                        "but received %d",
-                        context_.GetComputeCapability()));
-  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
-  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);
-#if CUDA_VERSION >= 8000
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
-  CUBlas<phi::dtype::complex<double>>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb,
-      A, CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F);
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::complex<double>>::GEMM(
-        handle, cuTransB, cuTransA, N, M, K, &c_alpha, h_B, ldb, h_A, lda, &c_beta, h_C, N);
-  });
-#endif  // CUDA_VERSION >= 8000
-}
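Note: in the complex paths above, alpha and beta are converted to thrust::complex values and the operands are described to cublasGemmEx as CUDA_C_32F/CUDA_C_64F; numerically this is the same product a plain cublasZgemm would compute. A hedged standalone equivalent for the complex128 case, using the same operand swap (function and buffer names are illustrative):

#include <cublas_v2.h>
#include <cuComplex.h>

// Row-major complex128 GEMM via cublasZgemm: C = alpha * A * B + beta * C
// with A[M x K], B[K x N], C[M x N], computed as the column-major C^T = B^T * A^T.
void row_major_zgemm(cublasHandle_t handle, int M, int N, int K,
                     cuDoubleComplex alpha, const cuDoubleComplex* dA,
                     const cuDoubleComplex* dB, cuDoubleComplex beta,
                     cuDoubleComplex* dC) {
  cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
              &alpha, dB, N, dA, K, &beta, dC, N);
}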
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA, bool transB,
-                                                     int M, int N, int K, T alpha,
-                                                     const T *A, int lda,
-                                                     const T *B, int ldb,
-                                                     T beta, T *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-#if CUDA_VERSION >= 8000
-  if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
-    auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-    CUBlas<T>::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha,
-                       B, CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta,
-                       C, CUDA_R_32F, ldc);
-  } else {
-#endif  // CUDA_VERSION >= 8000
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-                      A, lda, &beta, C, ldc);
-    });
-#if CUDA_VERSION >= 8000
-  }
-#endif  // CUDA_VERSION >= 8000
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMM(bool transA,
...
@@ -1972,45 +1294,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
 #endif  // CUDA_VERSION >= 8000
 }
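Note: with the duplicate specializations gone, GPU kernels obtain the one remaining implementation through Blas<phi::GPUContext>, whether the calling code was written against fluid or phi. A minimal caller-side sketch, assuming the GetBlas helper declared in blas.h and illustrative buffer names:

#include "paddle/phi/kernels/funcs/blas/blas.h"

// d_a: M x K, d_b: K x N, d_c: M x N, all row-major device buffers (illustrative).
void MatMulExample(const phi::GPUContext& dev_ctx, const float* d_a,
                   const float* d_b, float* d_c, int M, int N, int K) {
  auto blas = phi::funcs::GetBlas<phi::GPUContext, float>(dev_ctx);
  // C = 1.0 * A * B + 0.0 * C, no transposes.
  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, d_a, d_b, 0.0f, d_c);
}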
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    bool transA, bool transB, int M, int N, int K, phi::dtype::float16 alpha,
-    const phi::dtype::float16 *A, int lda, const phi::dtype::float16 *B, int ldb,
-    phi::dtype::float16 beta, phi::dtype::float16 *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<phi::dtype::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
-                                      &alpha, B, ldb, A, lda, &beta, C, ldc);
-  });
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(bool transA,
...
@@ -2049,17 +1332,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
-                                                     T *y) const {
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
...
@@ -2068,15 +1340,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
-  context_.CublasCall(
-      [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
...
@@ -2084,15 +1347,6 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
       [&](cublasHandle_t handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
-  context_.CublasCall(
-      [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
...
@@ -2100,23 +1354,6 @@ void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
       [&](cublasHandle_t handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
 }
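Note: AXPY, SCAL and VCOPY are thin forwards to the corresponding cuBLAS level-1 routines: y = alpha * x + y, x = alpha * x, and a plain copy. A hedged direct-cuBLAS equivalent for float data (buffer names are illustrative):

#include <cublas_v2.h>

void level1_examples(cublasHandle_t handle, int n, float alpha,
                     const float* d_x, float* d_y) {
  cublasSaxpy(handle, n, &alpha, d_x, 1, d_y, 1);  // y = alpha * x + y
  cublasSscal(handle, n, &alpha, d_y, 1);          // y = alpha * y
  cublasScopy(handle, n, d_x, 1, d_y, 1);          // y = x
}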
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
-                                                     T alpha, const T *A, const T *B,
-                                                     T beta, T *C) const {
-  cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
@@ -2134,27 +1371,6 @@ void Blas<phi::GPUContext>::GEMV(bool trans_a,
   });
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
-    bool trans_a, int M, int N, phi::dtype::float16 alpha,
-    const phi::dtype::float16 *A, const phi::dtype::float16 *B,
-    phi::dtype::float16 beta, phi::dtype::float16 *C) const {
-  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
-  if (trans_a) {
-    this->template GEMM<phi::dtype::float16>(
-        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
-  } else {
-    this->template GEMM<phi::dtype::float16>(
-        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
-  }
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
@@ -2175,28 +1391,6 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
   }
 }
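Note: cuBLAS has no half or bfloat16 GEMV, so the wrappers above rewrite the matrix-vector product as a degenerate GEMM: y = A·x becomes an M x 1 result and y = Aᵀ·x a 1 x N result. A standalone sketch of the same idea with cublasHgemm (the helper name is illustrative):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Half-precision y = alpha * A * x + beta * y with row-major A (M x N),
// expressed as a degenerate GEMM because cuBLAS has no Hgemv.
void half_gemv_via_hgemm(cublasHandle_t handle, int M, int N, __half alpha,
                         const __half* dA, const __half* dx, __half beta,
                         __half* dy) {
  // Row-major C(M x 1) = A(M x N) * x(N x 1)  ==  column-major C^T = x^T * A^T.
  cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/1, /*n=*/M, /*k=*/N,
              &alpha, dx, /*lda=*/1, dA, /*ldb=*/N, &beta, dy, /*ldc=*/1);
}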
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
-    bool trans_a, int M, int N, phi::dtype::bfloat16 alpha,
-    const phi::dtype::bfloat16 *A, const phi::dtype::bfloat16 *B,
-    phi::dtype::bfloat16 beta, phi::dtype::bfloat16 *C) const {
-  // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve
-  // it.
-  if (trans_a) {
-    this->template GEMM<phi::dtype::bfloat16>(
-        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
-  } else {
-    this->template GEMM<phi::dtype::bfloat16>(
-        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
-  }
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
@@ -2212,121 +1406,10 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
   if (trans_a) {
     this->template GEMM<phi::dtype::bfloat16>(
         CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
   } else {
     this->template GEMM<phi::dtype::bfloat16>(
         CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
   }
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
-    int64_t strideA, int64_t strideB) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int64_t strideC = M * N;
-#if CUDA_VERSION >= 9010
-  if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same<T, float>::value)) ||
-      std::is_same<T, phi::dtype::float16>::value) {
-    cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-    bool use_tensor_op_math = context_.tensor_core_available();
-    if (use_tensor_op_math) {
-      algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-    }
-    VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
-    VLOG(4) << "use_half_precision_compute_type: "
-            << FLAGS_gemm_use_half_precision_compute_type;
-    auto fp = std::is_same<T, float>::value ? CUDA_R_32F : CUDA_R_16F;
-    cudaDataType_t compute_type = fp;
-    float h_alpha = static_cast<float>(alpha);
-    float h_beta = static_cast<float>(beta);
-    void *a = static_cast<void *>(&h_alpha);
-    void *b = static_cast<void *>(&h_beta);
-    // set ComputeType as CUDA_R_32F for fp16, for better accuracy
-    if (FLAGS_gemm_use_half_precision_compute_type == true &&
-        std::is_same<T, phi::dtype::float16>::value) {
-      a = static_cast<void *>(&alpha);
-      b = static_cast<void *>(&beta);
-      compute_type = CUDA_R_16F;
-    }
-    // set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy
-    context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          paddle::platform::dynload::cublasGemmStridedBatchedEx(
-              handle, cuTransB, cuTransA, N, M, K, a, B, fp, ldb, strideB,
-              A, fp, lda, strideA, b, C, fp, ldc, strideC, batchCount,
-              compute_type, algo));
-    });
-  } else {
-#endif  // CUDA_VERSION >= 9010
-    context_.CublasCall([&](cublasHandle_t handle) {
-      CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
-                                    B, ldb, strideB, A, lda, strideA, &beta,
-                                    C, ldc, strideC, batchCount);
-    });
-#if CUDA_VERSION >= 9010
-  }
-#endif  // CUDA_VERSION >= 9010
-}
 template <>
 template <>
...
@@ -2438,78 +1521,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 9010
 }
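Note: in the strided-batched path, batch i multiplies A + i*strideA by B + i*strideB into C + i*strideC, with strideC fixed to M*N, and the per-type branch picks either cublasGemmStridedBatchedEx or the CUBlas GEMM_STRIDED_BATCH wrapper. A hedged standalone equivalent with plain cublasSgemmStridedBatched and the usual row-major swap (names are illustrative):

#include <cublas_v2.h>

// batchCount independent row-major products C_i = A_i * B_i, where
// A_i = dA + i * strideA, B_i = dB + i * strideB, C_i = dC + i * M * N.
void row_major_sgemm_strided_batched(cublasHandle_t handle, int M, int N, int K,
                                     const float* dA, long long strideA,
                                     const float* dB, long long strideB,
                                     float* dC, int batchCount) {
  const float alpha = 1.0f, beta = 0.0f;
  cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
                            &alpha, dB, N, strideB, dA, K, strideA,
                            &beta, dC, N, (long long)M * N, batchCount);
}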
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 *A,
-    const phi::dtype::bfloat16 *B, phi::dtype::bfloat16 beta,
-    phi::dtype::bfloat16 *C, int batchCount, int64_t strideA, int64_t strideB) const {
-#if CUDA_VERSION >= 11000
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int64_t strideC = M * N;
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-  bool use_tensor_op_math = context_.tensor_core_available();
-  if (use_tensor_op_math) {
-    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-  }
-  VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");
-  context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        paddle::platform::dynload::cublasGemmStridedBatchedEx(
-            handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb,
-            strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc,
-            strideC, batchCount, CUBLAS_COMPUTE_32F, algo));
-  });
-#else
-  // raise error
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= "
-      "11"));
-#endif  // CUDA_VERSION >= 11000
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -2582,26 +1593,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
 #endif  // CUDA_VERSION >= 11000
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
-  for (int k = 0; k < batchCount; ++k) {
-    this->template GEMM<T>(transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
-  }
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -2621,26 +1612,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   }
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::float16 alpha, const phi::dtype::float16 **A,
-    const phi::dtype::float16 **B, phi::dtype::float16 beta,
-    phi::dtype::float16 **C, int batchCount) const {
-  for (int k = 0; k < batchCount; ++k) {
-    this->template GEMM<phi::dtype::float16>(
-        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
-  }
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -2660,26 +1631,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   }
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::bfloat16 alpha, const phi::dtype::bfloat16 **A,
-    const phi::dtype::bfloat16 **B, phi::dtype::bfloat16 beta,
-    phi::dtype::bfloat16 **C, int batchCount) const {
-  for (int k = 0; k < batchCount; ++k) {
-    this->template GEMM<phi::dtype::bfloat16>(
-        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
-  }
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -2699,37 +1650,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   }
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                                     CBLAS_TRANSPOSE transA,
-                                                     CBLAS_DIAG diag, int M, int N,
-                                                     T alpha, const T *A, int lda,
-                                                     T *B, int ldb) const {
-  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
-  // where ' stands for transpose
-  cublasSideMode_t cuSide = (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
-  cublasFillMode_t cuUplo = (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
-  // use CUBLAS_OP_C (conjugate transpose) for complex
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasDiagType_t cuDiag = (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::TRSM(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
...
@@ -2761,15 +1681,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
   });
 }
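Note: TRSM keeps row-major semantics by transposing the whole system: op(A)·X = alpha*B becomes Xᵀ·op(A)ᵀ = alpha*Bᵀ, so the cuBLAS call flips the side and the fill mode and exchanges M and N. A hedged direct cublasStrsm equivalent for a left/lower/non-unit solve on row-major data (names are illustrative):

#include <cublas_v2.h>

// Row-major, side = left, lower-triangular, no transpose, non-unit diagonal:
// solve A * X = alpha * B in place, A is M x M, B/X are M x N.
void row_major_strsm_left_lower(cublasHandle_t handle, int M, int N, float alpha,
                                const float* dA, float* dB) {
  // Transposed view: X^T * A^T = alpha * B^T, so the side becomes RIGHT and the
  // lower triangle of A appears as the upper triangle of A^T.
  cublasStrsm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
              CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
              /*m=*/N, /*n=*/M, &alpha, dA, /*lda=*/M, dB, /*ldb=*/N);
}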
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
-    int n, T **a, int *ipiv, int *info, int batch_size) const {
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedGETRF(
...
@@ -2779,25 +1690,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
-    int n, const T **a, const int *ipiv, T **a_inv, int *info, int batch_size) const {
-  PADDLE_ENFORCE_NE(
-      a_inv, a,
-      phi::errors::InvalidArgument(
-          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
-          "in-place. The memory space of output matrix (address: %p) cannot "
-          "overlap memory space of input matrix (address: %p).",
-          a_inv, a));
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedGETRI(int n,
...
@@ -2820,15 +1712,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
-    int n, const T **a, T **a_inv, int *info, int batch_size) const {
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedMatInv(
...
@@ -2838,28 +1721,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
-    CBLAS_TRANSPOSE trans, int n, int nrhs, const T **a, int lda, int *ipiv,
-    T **b, int ldb, int *info, int batch_size) const {
-  // use CUBLAS_OP_C (conjugate transpose) for complex
-  cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::GETRS_BATCH(handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
...
@@ -2881,50 +1742,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
   });
 }
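Note: the batched LAPACK-style helpers are meant to be chained: BatchedGETRF factors every A_i in place, BatchedGETRI builds the inverses from those factors (out of place, hence the overlap check above), and BatchedGETRS reuses the factors to solve A_i * X_i = B_i. A hedged sketch of the underlying cuBLAS sequence for float matrices; the device pointer-array setup is assumed to exist already:

#include <cublas_v2.h>

// d_Aarray / d_Carray: device arrays of batchSize pointers, each to an n x n
// column-major matrix; d_pivots: batchSize * n ints; d_info: batchSize ints.
void batched_invert(cublasHandle_t handle, int n, float** d_Aarray,
                    float** d_Carray, int* d_pivots, int* d_info, int batchSize) {
  // In-place LU factorization of every matrix in the batch.
  cublasSgetrfBatched(handle, n, d_Aarray, n, d_pivots, d_info, batchSize);
  // Inversion from the LU factors; the output batch must not alias the input.
  cublasSgetriBatched(handle, n, (const float**)d_Aarray, n, d_pivots,
                      d_Carray, n, d_info, batchSize);
}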
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
-    CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
-    int M, int N, T alpha, const T **A, int lda, T **B, int ldb,
-    int batch_size) const {
-  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
-  // where ' stands for transpose
-  cublasSideMode_t cuSide = (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
-  cublasFillMode_t cuUplo = (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
-  // use CUBLAS_OP_C (conjugate transpose) for complex
-  cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasDiagType_t cuDiag = (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
-  context_.CublasCall([&](cublasHandle_t handle) {
-    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
-                          &alpha, A, lda, B, ldb, batch_size);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
...
paddle/phi/kernels/funcs/blas/blas_impl.hip.h  View file @ 88490567
...
@@ -257,54 +257,6 @@ struct CUBlas<phi::dtype::float16> {
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      rocblas_operation transa, rocblas_operation transb,
-                      int m, int n, int k, const void *alpha, const void *A,
-                      rocblas_datatype Atype, int lda, const void *B,
-                      rocblas_datatype Btype, int ldb, const void *beta, void *C,
-                      rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
-    rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
-          beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
-    });
-  }
-  template <typename... ARGS>
   static void GEMM_EX(phi::GPUContext *dev_ctx,
                       rocblas_operation transa,
                       rocblas_operation transb,
...
@@ -474,54 +426,6 @@ struct CUBlas<phi::dtype::complex<float>> {
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      rocblas_operation transa, rocblas_operation transb,
-                      int m, int n, int k, const void *alpha, const void *A,
-                      rocblas_datatype Atype, int lda, const void *B,
-                      rocblas_datatype Btype, int ldb, const void *beta, void *C,
-                      rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
-    rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
-          beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
-    });
-  }
-  template <typename... ARGS>
   static void GEMM_EX(phi::GPUContext *dev_ctx,
                       rocblas_operation transa,
                       rocblas_operation transb,
...
@@ -692,54 +596,6 @@ struct CUBlas<phi::dtype::complex<double>> {
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
-  static void GEMM_EX(paddle::platform::CUDADeviceContext *dev_ctx,
-                      rocblas_operation transa, rocblas_operation transb,
-                      int m, int n, int k, const void *alpha, const void *A,
-                      rocblas_datatype Atype, int lda, const void *B,
-                      rocblas_datatype Btype, int ldb, const void *beta, void *C,
-                      rocblas_datatype Ctype, int ldc, rocblas_datatype computeType) {
-    rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
-    dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
-      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
-          handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
-          beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0));
-    });
-  }
-  template <typename... ARGS>
   static void GEMM_EX(phi::GPUContext *dev_ctx,
                       rocblas_operation transa,
                       rocblas_operation transb,
...
@@ -789,45 +645,6 @@ struct CUBlas<phi::dtype::complex<double>> {
   }
 };
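Note: on the ROCm side each GEMM_EX forwards to rocblas_gemm_ex, whose signature differs from cublasGemmEx in that it takes a separate output matrix D plus a solution index and flags; the wrappers pass C again as D and zeros for the last two arguments. A hedged minimal call showing that shape, assuming float operands and illustrative names (the include path varies between ROCm versions):

#include <rocblas/rocblas.h>  // on older ROCm this may be <rocblas.h>

// Minimal rocblas_gemm_ex call: column-major C = alpha * A * B + beta * C,
// with C passed a second time as the output D, the standard algo, and
// solution_index = 0, flags = 0, as in the wrappers above.
void rocblas_gemm_ex_example(rocblas_handle handle, int m, int n, int k,
                             const float* dA, const float* dB, float* dC) {
  const float alpha = 1.0f, beta = 0.0f;
  rocblas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_none,
                  m, n, k, &alpha,
                  dA, rocblas_datatype_f32_r, m,
                  dB, rocblas_datatype_f32_r, k,
                  &beta,
                  dC, rocblas_datatype_f32_r, m,
                  dC, rocblas_datatype_f32_r, m,
                  rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
                  /*solution_index=*/0, /*flags=*/0);
}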
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                                     CBLAS_TRANSPOSE transB,
-                                                     int M, int N, int K, T alpha,
-                                                     const T *A, const T *B,
-                                                     T beta, T *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -868,62 +685,6 @@ void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
   });
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::float16 alpha, const phi::dtype::float16 *A,
-    const phi::dtype::float16 *B, phi::dtype::float16 beta,
-    phi::dtype::float16 *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas fp16 gemm requires GPU compute capability >= 53,"
-                        "but received %d",
-                        context_.GetComputeCapability()));
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::float16>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_f16_r,
-      ldb, A, rocblas_datatype_f16_r, lda, &h_beta, C, rocblas_datatype_f16_r, N,
-      rocblas_datatype_f32_r);
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -982,8 +743,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
 template <>
 template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA,
+inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                         CBLAS_TRANSPOSE transB,
                                         int M,
                                         int N,
...
@@ -1052,11 +812,11 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                         int M,
                                         int N,
                                         int K,
-                                        phi::dtype::bfloat16 alpha,
+                                        phi::dtype::complex<float> alpha,
-                                        const phi::dtype::bfloat16 *A,
+                                        const phi::dtype::complex<float> *A,
-                                        const phi::dtype::bfloat16 *B,
+                                        const phi::dtype::complex<float> *B,
-                                        phi::dtype::bfloat16 beta,
+                                        phi::dtype::complex<float> beta,
-                                        phi::dtype::bfloat16 *C) const {
+                                        phi::dtype::complex<float> *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
...
@@ -1067,134 +827,13 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
   rocblas_operation cuTransB = (transB == CblasNoTrans)
                                    ? rocblas_operation_none
                                    : rocblas_operation_transpose;
-  // TODO(zhiqiu): 80 has the same meaning for rocm and cuda?
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(
       context_.GetComputeCapability(),
-      80,
+      53,
       phi::errors::InvalidArgument(
-          "rocblas fp16 gemm requires GPU compute capability >= 80,"
+          "cublas complex64 gemm requires GPU compute capability >= 53,"
-          "but received %d",
-          context_.GetComputeCapability()));
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
-  context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
-    PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_gemm_ex(
-        handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r,
-        ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, rocblas_datatype_bf16_r,
-        N, C, rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo, 0, 0));
-  });
-}
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::complex<float> alpha, const phi::dtype::complex<float> *A,
-    const phi::dtype::complex<float> *B, phi::dtype::complex<float> beta,
-    phi::dtype::complex<float> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas complex64 gemm requires GPU compute capability >= 53,"
-                        "but received %d",
-                        context_.GetComputeCapability()));
-  thrust::complex<float> c_alpha = thrust::complex<float>(alpha.real, alpha.imag);
-  thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::complex<float>>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, rocblas_datatype_f32_c,
-      ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, N,
-      rocblas_datatype_f32_c);
-}
-template <>
-template <>
-inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                        CBLAS_TRANSPOSE transB,
-                                        int M, int N, int K,
-                                        phi::dtype::complex<float> alpha,
-                                        const phi::dtype::complex<float> *A,
-                                        const phi::dtype::complex<float> *B,
-                                        phi::dtype::complex<float> beta,
-                                        phi::dtype::complex<float> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas complex64 gemm requires GPU compute capability >= 53,"
                         "but received %d",
                         context_.GetComputeCapability()));
...
@@ -1223,64 +862,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                               rocblas_datatype_f32_c);
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    phi::dtype::complex<double> alpha, const phi::dtype::complex<double> *A,
-    const phi::dtype::complex<double> *B, phi::dtype::complex<double> beta,
-    phi::dtype::complex<double> *C) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    phi::errors::InvalidArgument(
-                        "cublas complex128 gemm requires GPU compute capability >= 53,"
-                        "but received %d",
-                        context_.GetComputeCapability()));
-  thrust::complex<double> c_alpha = thrust::complex<double>(alpha.real, alpha.imag);
-  thrust::complex<double> c_beta = thrust::complex<double>(beta.real, beta.imag);
-  auto &cuda_ctx = const_cast<paddle::platform::CUDADeviceContext &>(context_);
-  CUBlas<phi::dtype::complex<double>>::GEMM_EX(
-      &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, rocblas_datatype_f64_c,
-      ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, N,
-      rocblas_datatype_f64_c);
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
...
@@ -1339,44 +920,6 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                               rocblas_datatype_f64_c);
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMM(bool transA, bool transB,
-                                                     int M, int N, int K, T alpha,
-                                                     const T *A, int lda,
-                                                     const T *B, int ldb,
-                                                     T beta, T *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  rocblas_operation cuTransA = transA ? rocblas_operation_transpose : rocblas_operation_none;
-  rocblas_operation cuTransB = transB ? rocblas_operation_transpose : rocblas_operation_none;
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMM(bool transA,
...
@@ -1416,46 +959,6 @@ void Blas<phi::GPUContext>::GEMM(bool transA,
   });
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMM(
-    bool transA, bool transB, int M, int N, int K, phi::dtype::float16 alpha,
-    const phi::dtype::float16 *A, int lda, const phi::dtype::float16 *B, int ldb,
-    phi::dtype::float16 beta, phi::dtype::float16 *C, int ldc) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  rocblas_operation cuTransA = transA ? rocblas_operation_transpose : rocblas_operation_none;
-  rocblas_operation cuTransB = transB ? rocblas_operation_transpose : rocblas_operation_none;
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<phi::dtype::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
-                                      &alpha, B, ldb, A, lda, &beta, C, ldc);
-  });
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMM(bool transA,
...
@@ -1496,16 +999,6 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
-                                                     T *y) const {
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
...
@@ -1514,14 +1007,6 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
   });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::SCAL(int n, const T alpha, T *x) const {
-  context_.CublasCall(
-      [&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
...
@@ -1529,14 +1014,6 @@ void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
       [&](rocblas_handle handle) { CUBlas<T>::SCAL(handle, n, &alpha, x, 1); });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::VCOPY(int n, const T *x, T *y) const {
-  context_.CublasCall(
-      [&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
...
@@ -1544,23 +1021,6 @@ void Blas<phi::GPUContext>::VCOPY(int n, const T *x, T *y) const {
       [&](rocblas_handle handle) { CUBlas<T>::VCOPY(handle, n, x, 1, y, 1); });
 }
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
-                                                     T alpha, const T *A, const T *B,
-                                                     T beta, T *C) const {
-  rocblas_operation cuTransA = !trans_a ? rocblas_operation_transpose : rocblas_operation_none;
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
-  });
-}
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
@@ -1579,26 +1039,6 @@ void Blas<phi::GPUContext>::GEMV(bool trans_a,
   });
 }
-template <>
-template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
-    bool trans_a, int M, int N, phi::dtype::float16 alpha,
-    const phi::dtype::float16 *A, const phi::dtype::float16 *B,
-    phi::dtype::float16 beta, phi::dtype::float16 *C) const {
-  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
-  if (trans_a) {
-    this->template GEMM<phi::dtype::float16>(
-        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
-  } else {
-    this->template GEMM<phi::dtype::float16>(
-        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
-  }
-}
 template <>
 template <>
 inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
...
@@ -1621,92 +1061,22 @@ inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
 template <>
 template <>
-inline void Blas<paddle::platform::CUDADeviceContext>::GEMV(
-    bool trans_a,
+inline void Blas<phi::GPUContext>::GEMV(bool trans_a,
                                         int M,
                                         int N,
                                         phi::dtype::bfloat16 alpha,
                                         const phi::dtype::bfloat16 *A,
                                         const phi::dtype::bfloat16 *B,
                                         phi::dtype::bfloat16 beta,
                                         phi::dtype::bfloat16 *C) const {
   // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it.
   if (trans_a) {
     this->template GEMM<phi::dtype::bfloat16>(
         CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
   } else {
     this->template GEMM<phi::dtype::bfloat16>(
         CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
   }
 }
-template <>
-template <>
-inline void Blas<phi::GPUContext>::GEMV(bool trans_a, int M, int N,
-                                        phi::dtype::bfloat16 alpha,
-                                        const phi::dtype::bfloat16 *A,
-                                        const phi::dtype::bfloat16 *B,
-                                        phi::dtype::bfloat16 beta,
-                                        phi::dtype::bfloat16 *C) const {
-  // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it.
-  if (trans_a) {
-    this->template GEMM<phi::dtype::bfloat16>(
-        CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C);
-  } else {
-    this->template GEMM<phi::dtype::bfloat16>(
-        CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C);
-  }
-}
-template <>
-template <typename T>
-void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
-    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
-    int64_t strideA, int64_t strideB) const {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  const int64_t strideC = M * N;
-  context_.CublasCall([&](rocblas_handle handle) {
-    CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
-                                  B, ldb, strideB, A, lda, strideA, &beta,
-                                  C, ldc, strideC, batchCount);
-  });
-}
 template <>
 template <>
...
@@ -1758,71 +1128,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   });
 }
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA,
    CBLAS_TRANSPOSE transB,
    int M,
    int N,
    int K,
    phi::dtype::bfloat16 alpha,
    const phi::dtype::bfloat16 *A,
    const phi::dtype::bfloat16 *B,
    phi::dtype::bfloat16 beta,
    phi::dtype::bfloat16 *C,
    int batchCount,
    int64_t strideA,
    int64_t strideB) const {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  const int64_t strideC = M * N;
  rocblas_operation cuTransA = (transA == CblasNoTrans)
                                   ? rocblas_operation_none
                                   : rocblas_operation_transpose;
  rocblas_operation cuTransB = (transB == CblasNoTrans)
                                   ? rocblas_operation_none
                                   : rocblas_operation_transpose;
  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);
  rocblas_gemm_algo algo = rocblas_gemm_algo_standard;
  context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::rocblas_gemm_strided_batched_ex(
            handle,
            cuTransB,
            cuTransA,
            N,
            M,
            K,
            &h_alpha,
            B,
            rocblas_datatype_bf16_r,
            ldb,
            strideB,
            A,
            rocblas_datatype_bf16_r,
            lda,
            strideA,
            &h_beta,
            C,
            rocblas_datatype_bf16_r,
            ldc,
            strideC,
            C,
            rocblas_datatype_bf16_r,
            ldc,
            strideC,
            batchCount,
            rocblas_datatype_f32_r,
            algo,
            0,
            0));
  });
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -1887,26 +1192,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  });
}
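The bfloat16 strided-batched path above promotes alpha and beta to float and requests float accumulation (rocblas_datatype_f32_r). The sketch below (standalone, not Paddle code; the bfloat16 conversion is emulated in software) illustrates why accumulating a long reduction directly in bfloat16 is lossy:

    // bfloat16 keeps only about 8 significant bits, so a running sum rounded
    // to bfloat16 after every add stalls long before the float sum does.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Round-to-nearest-even emulation of float -> bfloat16 -> float.
    static float to_bf16(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      uint32_t lsb = (bits >> 16) & 1u;
      bits = (bits + 0x7FFFu + lsb) & 0xFFFF0000u;
      float y;
      std::memcpy(&y, &bits, sizeof(y));
      return y;
    }

    int main() {
      const int K = 4096;
      float acc_fp32 = 0.f;   // float accumulator (what the call above requests)
      float acc_bf16 = 0.f;   // accumulator rounded to bfloat16 after every add
      for (int k = 0; k < K; ++k) {
        float prod = to_bf16(1.0f) * to_bf16(0.001f);
        acc_fp32 += prod;
        acc_bf16 = to_bf16(acc_bf16 + prod);
      }
      // The bfloat16 sum stalls well below the float sum.
      printf("fp32 accumulate: %g, bf16 accumulate: %g\n", acc_fp32, acc_bf16);
      return 0;
    }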
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA,
    CBLAS_TRANSPOSE transB,
    int M,
    int N,
    int K,
    T alpha,
    const T **A,
    const T **B,
    T beta,
    T **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<T>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -1926,25 +1211,6 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA,
    CBLAS_TRANSPOSE transB,
    int M,
    int N,
    int K,
    phi::dtype::float16 alpha,
    const phi::dtype::float16 **A,
    const phi::dtype::float16 **B,
    phi::dtype::float16 beta,
    phi::dtype::float16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::float16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -1964,26 +1230,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}
template <>
template <>
inline void Blas<paddle::platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA,
    CBLAS_TRANSPOSE transB,
    int M,
    int N,
    int K,
    phi::dtype::bfloat16 alpha,
    const phi::dtype::bfloat16 **A,
    const phi::dtype::bfloat16 **B,
    phi::dtype::bfloat16 beta,
    phi::dtype::bfloat16 **C,
    int batchCount) const {
  for (int k = 0; k < batchCount; ++k) {
    this->template GEMM<phi::dtype::bfloat16>(
        transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
  }
}

template <>
template <>
inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
...
@@ -2003,37 +1249,6 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
  }
}
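Unlike the strided variants, these pointer-array overloads cannot assume the batch matrices sit at a fixed stride from one another, so they fall back to one GEMM per batch element. A standalone CPU sketch of the pointer-array interface (illustration only; gemm_ref is a hypothetical reference routine):

    // Pointer-array batching gathers matrices that are not stored
    // contiguously, mirroring the loop in the overloads above.
    #include <cstdio>
    #include <vector>

    // Row-major reference GEMM: C(MxN) = A(MxK) * B(KxN)
    static void gemm_ref(int M, int N, int K, const float* A, const float* B, float* C) {
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) {
          float acc = 0.f;
          for (int k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
          C[i * N + j] = acc;
        }
    }

    int main() {
      const int M = 2, N = 2, K = 2, batchCount = 2;
      // Two independent allocations per operand: no common stride exists.
      std::vector<float> A0 = {1, 0, 0, 1}, A1 = {2, 0, 0, 2};
      std::vector<float> B0 = {1, 2, 3, 4}, B1 = {5, 6, 7, 8};
      std::vector<float> C0(4), C1(4);
      const float* A[] = {A0.data(), A1.data()};
      const float* B[] = {B0.data(), B1.data()};
      float* C[] = {C0.data(), C1.data()};
      for (int k = 0; k < batchCount; ++k)
        gemm_ref(M, N, K, A[k], B[k], C[k]);   // one GEMM per batch entry
      printf("%g %g %g %g\n", C1[0], C1[1], C1[2], C1[3]);  // 10 12 14 16
      return 0;
    }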
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side,
                                                     CBLAS_UPLO uplo,
                                                     CBLAS_TRANSPOSE transA,
                                                     CBLAS_DIAG diag,
                                                     int M,
                                                     int N,
                                                     T alpha,
                                                     const T *A,
                                                     int lda,
                                                     T *B,
                                                     int ldb) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  rocblas_side cuSide =
      (side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
  rocblas_fill cuUplo =
      (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  rocblas_operation cuTransA = (transA == CblasNoTrans)
                                   ? rocblas_operation_none
                                   : rocblas_operation_transpose;
  rocblas_diagonal cuDiag =
      (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::TRSM(
        handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
...
@@ -2066,14 +1281,6 @@ void Blas<phi::GPUContext>::TRSM(CBLAS_SIDE side,
  });
}
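The deleted TRSM wrapper relies on the same row-major trick as the GEMM wrappers: a row-major triangular solve is handed to the column-major library as its transposed problem. In symbols (my restatement of the comment in the code, with \top denoting transpose):

    \mathrm{op}(A)\,X = \alpha B \;\iff\; X^{\top}\,\mathrm{op}(A)^{\top} = \alpha B^{\top}

Since a row-major buffer reread as column-major is exactly the transpose, the wrapper can pass the same pointers and only swap M with N, left with right, and lower with upper, which is why cuSide maps CblasLeft to rocblas_side_right and cuUplo maps CblasLower to rocblas_fill_upper.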
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRF(
    int n, T **a, int *ipiv, int *info, int batch_size) const {
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRF(
...
@@ -2083,24 +1290,6 @@ void Blas<phi::GPUContext>::BatchedGETRF(
  });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRI(
    int n,
    const T **a,
    const int *ipiv,
    T **a_inv,
    int *info,
    int batch_size) const {
  PADDLE_ENFORCE_NE(
      a_inv,
      a,
      phi::errors::InvalidArgument(
          "cuBLAS fuction 'cublas<S/D>getrfBatched' cannot be executed "
          "in-place. The memory space of output matrix (address: %p) cannot "
          "overlap memory space of input matrix (address: %p).",
          a_inv,
          a));
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRI(int n,
...
@@ -2123,14 +1312,6 @@ void Blas<phi::GPUContext>::BatchedGETRI(int n,
  });
}
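The PADDLE_ENFORCE_NE above exists because the batched inversion reads the factorized input while it writes the inverse, so the two buffers must not alias. A tiny standalone sketch of the same constraint (my own 2x2 closed-form example, not the library routine):

    // Why the inversion cannot run in-place: every input entry is still
    // needed after the first output has been written, so out and in must not
    // alias (the same restriction the check above encodes for the batched
    // getri routine).
    #include <cassert>
    #include <cstdio>

    static void inverse2x2(const double* in, double* out) {
      assert(in != out);  // aliasing would let writes clobber entries still to be read
      double det = in[0] * in[3] - in[1] * in[2];
      out[0] =  in[3] / det;
      out[1] = -in[1] / det;
      out[2] = -in[2] / det;
      out[3] =  in[0] / det;
    }

    int main() {
      double a[4] = {4, 7, 2, 6}, inv[4];
      inverse2x2(a, inv);
      printf("%g %g %g %g\n", inv[0], inv[1], inv[2], inv[3]);  // 0.6 -0.7 -0.2 0.4
      return 0;
    }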
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedMatInv(
    int n, const T **a, T **a_inv, int *info, int batch_size) const {
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedMatInv(
...
@@ -2140,27 +1321,6 @@ void Blas<phi::GPUContext>::BatchedMatInv(
  });
}
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedGETRS(
    CBLAS_TRANSPOSE trans,
    int n,
    int nrhs,
    const T **a,
    int lda,
    int *ipiv,
    T **b,
    int ldb,
    int *info,
    int batch_size) const {
  rocblas_operation cuTrans = (trans == CblasNoTrans)
                                  ? rocblas_operation_none
                                  : rocblas_operation_transpose;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::GETRS_BATCH(
        handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
...
@@ -2182,50 +1342,6 @@ void Blas<phi::GPUContext>::BatchedGETRS(CBLAS_TRANSPOSE trans,
  });
}
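BatchedGETRF and BatchedGETRS together solve many small dense systems: getrf factors each matrix into triangular factors (recording row pivots in ipiv) and getrs reuses that factorization for forward and back substitution. A minimal single-matrix CPU sketch of the same pipeline, without pivoting for brevity (the real batched routines do partial pivoting, which is what ipiv records):

    #include <cstdio>

    // In-place LU factorization (no pivoting): A becomes L (unit diagonal,
    // stored below the diagonal) and U (on and above the diagonal).
    static void lu_factor(int n, double* A) {
      for (int k = 0; k < n; ++k)
        for (int i = k + 1; i < n; ++i) {
          A[i * n + k] /= A[k * n + k];
          for (int j = k + 1; j < n; ++j) A[i * n + j] -= A[i * n + k] * A[k * n + j];
        }
    }

    // Solve A x = b from the packed LU factors: forward substitution with L,
    // then back substitution with U (the per-matrix work of a getrs call).
    static void lu_solve(int n, const double* LU, const double* b, double* x) {
      for (int i = 0; i < n; ++i) {            // L y = b
        x[i] = b[i];
        for (int j = 0; j < i; ++j) x[i] -= LU[i * n + j] * x[j];
      }
      for (int i = n - 1; i >= 0; --i) {       // U x = y
        for (int j = i + 1; j < n; ++j) x[i] -= LU[i * n + j] * x[j];
        x[i] /= LU[i * n + i];
      }
    }

    int main() {
      double A[9] = {4, 3, 0, 3, 4, -1, 0, -1, 4};
      double b[3] = {24, 30, -24};
      double x[3];
      lu_factor(3, A);
      lu_solve(3, A, b, x);
      printf("%g %g %g\n", x[0], x[1], x[2]);  // 3 4 -5
      return 0;
    }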
template <>
template <typename T>
void Blas<paddle::platform::CUDADeviceContext>::BatchedTRSM(
    CBLAS_SIDE side,
    CBLAS_UPLO uplo,
    CBLAS_TRANSPOSE transA,
    CBLAS_DIAG diag,
    int M,
    int N,
    T alpha,
    const T **A,
    int lda,
    T **B,
    int ldb,
    int batch_size) const {
  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'`
  // where ' stands for transpose
  rocblas_side cuSide =
      (side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
  rocblas_fill cuUplo =
      (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
  // use CUBLAS_OP_C (conjugate transpose) for complex
  rocblas_operation cuTransA = (transA == CblasNoTrans)
                                   ? rocblas_operation_none
                                   : rocblas_operation_transpose;
  rocblas_diagonal cuDiag =
      (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
  context_.CublasCall([&](rocblas_handle handle) {
    CUBlas<T>::TRSM_BATCH(handle,
                          cuSide,
                          cuUplo,
                          cuTransA,
                          cuDiag,
                          N,
                          M,
                          &alpha,
                          A,
                          lda,
                          B,
                          ldb,
                          batch_size);
  });
}

template <>
template <typename T>
void Blas<phi::GPUContext>::BatchedTRSM(CBLAS_SIDE side,
...
paddle/phi/kernels/funcs/fc_functor.cu
View file @ 88490567
...
@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
  AddReluKernel(context.stream(), M, N, Y, B, relu);
}

template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
...
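For context on the retained line: FCFunctor runs a GEMM for Y = X*W and then calls AddReluKernel as a fused epilogue that adds the bias and optionally applies ReLU. A CPU sketch of that epilogue (illustration only, not the Paddle kernel):

    // Add a per-column bias to Y (M x N, row-major) and optionally clamp at zero.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static void add_bias_relu_ref(int M, int N, float* Y, const float* B, bool relu) {
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) {
          float v = Y[i * N + j] + B[j];           // bias broadcast along rows
          Y[i * N + j] = relu ? std::max(v, 0.f) : v;
        }
    }

    int main() {
      const int M = 2, N = 3;
      std::vector<float> Y = {-1, 2, -3, 4, -5, 6};
      std::vector<float> B = {0.5f, -2.5f, 0.f};
      add_bias_relu_ref(M, N, Y.data(), B.data(), /*relu=*/true);
      for (float v : Y) printf("%g ", v);          // 0 0 0 4.5 0 6
      printf("\n");
      return 0;
    }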
paddle/phi/kernels/funcs/for_range.h
View file @ 88490567
...
@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
  size_t limit_;
};

// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CUDADeviceContext> {
  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
      : dev_ctx_(dev_ctx), limit_(limit) {}

  template <typename Function>
  inline void operator()(Function func) const {
    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
    for_range(func);
  }

  const paddle::platform::CUDADeviceContext& dev_ctx_;
  size_t limit_;
};
#endif

}  // namespace funcs
...
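ForRange<Context> applies a callable to every index in [0, limit); the GPU implementation launches a kernel, and the specialization removed here merely forwarded the legacy CUDADeviceContext to the phi::GPUContext version. A serial stand-in with the same contract (my own illustration, not the Paddle header):

    #include <cstddef>
    #include <cstdio>

    // Serial stand-in for phi::funcs::ForRange: call func(i) for i in [0, limit).
    struct ForRangeSerial {
      explicit ForRangeSerial(size_t limit) : limit_(limit) {}

      template <typename Function>
      void operator()(Function func) const {
        for (size_t i = 0; i < limit_; ++i) func(i);
      }

      size_t limit_;
    };

    int main() {
      double data[4] = {1, 2, 3, 4};
      ForRangeSerial for_range(4);
      for_range([&](size_t i) { data[i] *= 2; });   // elementwise functor, as kernels use it
      printf("%g %g %g %g\n", data[0], data[1], data[2], data[3]);  // 2 4 6 8
      return 0;
    }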
paddle/phi/kernels/funcs/math_function.cu
View file @ 88490567
...
@@ -31,22 +31,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;

template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext, phi::dtype::complex<double>>;
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
...
@@ -76,32 +60,6 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                            phi::dtype::complex<double>>;

#define DEFINE_GPU_TRANS(RANK)                                                 \
  template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>;  \
  template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            double,                                            \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            float16,                                           \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            bfloat16,                                          \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            int8_t,                                            \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            int32_t,                                           \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            int64_t,                                           \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            phi::dtype::complex<float>,                        \
                            RANK>;                                             \
  template struct Transpose<paddle::platform::CUDADeviceContext,               \
                            phi::dtype::complex<double>,                       \
                            RANK>;                                             \
  template struct Transpose<phi::GPUContext, bool, RANK>;                      \
  template struct Transpose<phi::GPUContext, float, RANK>;                     \
  template struct Transpose<phi::GPUContext, double, RANK>;                    \
...
@@ -241,7 +199,6 @@ struct TransposeNormal<phi::GPUContext, T> {
// define transpose normal
#define DEFINE_GPU_TRANS_NORMAL(TYPE)                                          \
  template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>;  \
  template struct TransposeNormal<phi::GPUContext, TYPE>

DEFINE_GPU_TRANS_NORMAL(float16);
...
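The blocks above are explicit template instantiations: SetConstant, Transpose and TransposeNormal are defined in this .cu translation unit, so every (Context, type) combination used elsewhere has to be stamped out here, and the commit drops the CUDADeviceContext column of that matrix. A toy version of the pattern (hypothetical names, not Paddle code):

    #include <cstdio>

    template <typename Context, typename T>
    struct SetConstantToy {
      void operator()(T* data, int n, T value) const {
        for (int i = 0; i < n; ++i) data[i] = value;   // real code dispatches on Context
      }
    };

    struct CPUContextToy {};   // stand-in for phi::GPUContext etc.

    // The macro stamps out the instantiations other translation units link
    // against, just as DEFINE_GPU_TRANS does for Transpose above.
    #define DEFINE_SET_CONSTANT_TOY(TYPE) \
      template struct SetConstantToy<CPUContextToy, TYPE>

    DEFINE_SET_CONSTANT_TOY(float);
    DEFINE_SET_CONSTANT_TOY(double);

    int main() {
      float buf[3];
      SetConstantToy<CPUContextToy, float>()(buf, 3, 1.5f);
      printf("%g %g %g\n", buf[0], buf[1], buf[2]);   // 1.5 1.5 1.5
      return 0;
    }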
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
View file @ 88490567
...
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;

// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, double>;

}  // namespace funcs
}  // namespace phi
paddle/phi/kernels/funcs/matrix_solve.cu
View file @ 88490567
...
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;

// TODO(wuweilong): remove these instantiations later
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;

}  // namespace funcs
}  // namespace phi