Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
88490567
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
88490567
编写于
7月 29, 2022
作者:
L
Leo Chen
提交者:
GitHub
7月 29, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove cudaDeviceContext * remove more template * fix rocm compile
上级
0a2db7c8
变更
26
展开全部
隐藏空白更改
内联
并排
Showing
26 changed file
with
122 addition
and
2801 deletion
+122
-2801
paddle/fluid/framework/details/eager_deletion_op_handle.h
paddle/fluid/framework/details/eager_deletion_op_handle.h
+0
-6
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
...e/fluid/memory/allocation/cuda_device_context_allocator.h
+0
-5
paddle/fluid/operators/cudnn_lstm_op.cu.cc
paddle/fluid/operators/cudnn_lstm_op.cu.cc
+0
-7
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
+4
-3
paddle/fluid/operators/gru_op.cu.cc
paddle/fluid/operators/gru_op.cu.cc
+0
-7
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+0
-5
paddle/fluid/operators/math/im2col.cu
paddle/fluid/operators/math/im2col.cu
+0
-24
paddle/fluid/operators/math/maxouting.cu
paddle/fluid/operators/math/maxouting.cu
+0
-6
paddle/fluid/operators/math/sample_prob.h
paddle/fluid/operators/math/sample_prob.h
+0
-6
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+2
-159
paddle/fluid/operators/math/sequence_padding.cu
paddle/fluid/operators/math/sequence_padding.cu
+0
-159
paddle/fluid/operators/math/sequence_scale.cu
paddle/fluid/operators/math/sequence_scale.cu
+0
-40
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+0
-35
paddle/fluid/operators/math/vol2col.cu
paddle/fluid/operators/math/vol2col.cu
+0
-4
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+0
-6
paddle/fluid/platform/collective_helper.h
paddle/fluid/platform/collective_helper.h
+0
-1
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+0
-5
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+1
-9
paddle/fluid/platform/transform.h
paddle/fluid/platform/transform.h
+0
-60
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+80
-1263
paddle/phi/kernels/funcs/blas/blas_impl.hip.h
paddle/phi/kernels/funcs/blas/blas_impl.hip.h
+22
-906
paddle/phi/kernels/funcs/fc_functor.cu
paddle/phi/kernels/funcs/fc_functor.cu
+0
-4
paddle/phi/kernels/funcs/for_range.h
paddle/phi/kernels/funcs/for_range.h
+0
-16
paddle/phi/kernels/funcs/math_function.cu
paddle/phi/kernels/funcs/math_function.cu
+13
-56
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
+0
-5
paddle/phi/kernels/funcs/matrix_solve.cu
paddle/phi/kernels/funcs/matrix_solve.cu
+0
-4
未找到文件。
paddle/fluid/framework/details/eager_deletion_op_handle.h
浏览文件 @
88490567
...
...
@@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
}
// namespace paddle
namespace
paddle
{
namespace
framework
{
class
GarbageCollector
;
...
...
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
浏览文件 @
88490567
...
...
@@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
namespace
memory
{
namespace
allocation
{
...
...
paddle/fluid/operators/cudnn_lstm_op.cu.cc
浏览文件 @
88490567
...
...
@@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
}
// namespace paddle
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
浏览文件 @
88490567
...
...
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif
size_t
N
=
static_cast
<
size_t
>
(
batch_size
*
slot_num
*
embedding_size
);
platform
::
GpuLaunchConfig
config
=
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
platform
::
GpuLaunchConfig
config
=
platform
::
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
// first sum pool
FusedSeqpoolKernelNormal
<<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
...
...
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N
=
static_cast
<
size_t
>
(
batch_size
*
slot_num
*
(
embedding_size
-
cvm_offset
));
platform
::
GpuLaunchConfig
config
=
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
platform
::
GpuLaunchConfig
config
=
platform
::
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
FusedCVMKernelNoCVM
<<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
...
...
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif
size_t
N
=
static_cast
<
size_t
>
(
batch_size
*
slot_num
*
embedding_size
);
auto
config
=
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
auto
config
=
platform
::
GetGpuLaunchConfig1D
(
dev_ctx
,
N
);
if
(
use_cvm
)
{
// join grad
FusedSeqpoolCVMGradKernelWithCVM
<<<
config
.
block_per_grid
.
x
,
...
...
paddle/fluid/operators/gru_op.cu.cc
浏览文件 @
88490567
...
...
@@ -14,13 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
}
// namespace paddle
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
88490567
...
...
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
CrossEntropyFunctor
<
phi
::
GPUContext
,
float
>;
template
class
CrossEntropyFunctor
<
phi
::
GPUContext
,
double
>;
template
class
CrossEntropyFunctor
<
phi
::
GPUContext
,
platform
::
float16
>;
...
...
paddle/fluid/operators/math/im2col.cu
浏览文件 @
88490567
...
...
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
platform
::
CUDADeviceContext
,
float
>;
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
platform
::
CUDADeviceContext
,
double
>;
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
phi
::
GPUContext
,
float
>;
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
phi
::
GPUContext
,
double
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
platform
::
CUDADeviceContext
,
float
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
platform
::
CUDADeviceContext
,
double
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
phi
::
GPUContext
,
float
>;
...
...
@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
platform
::
CUDADeviceContext
,
float
>;
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
platform
::
CUDADeviceContext
,
double
>;
template
class
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
phi
::
GPUContext
,
float
>;
...
...
@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi
::
GPUContext
,
double
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
platform
::
CUDADeviceContext
,
float
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
platform
::
CUDADeviceContext
,
double
>;
template
class
Col2ImFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kOCF
,
phi
::
GPUContext
,
float
>;
...
...
paddle/fluid/operators/math/maxouting.cu
浏览文件 @
88490567
...
...
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis
);
}
template
class
MaxOutGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
MaxOutGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
MaxOutFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
MaxOutFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
MaxOutGradFunctor
<
phi
::
GPUContext
,
float
>;
template
class
MaxOutGradFunctor
<
phi
::
GPUContext
,
double
>;
...
...
paddle/fluid/operators/math/sample_prob.h
浏览文件 @
88490567
...
...
@@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
}
// namespace paddle
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
88490567
...
...
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
}
// namespace
template
<
typename
T
>
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
phi
::
SelectedRows
&
input1
,
const
framework
::
Tensor
&
input2
,
framework
::
Tensor
*
output
)
{
auto
in1_height
=
input1
.
height
();
auto
in2_dims
=
input2
.
dims
();
auto
out_dims
=
output
->
dims
();
PADDLE_ENFORCE_EQ
(
in1_height
,
in2_dims
[
0
],
platform
::
errors
::
InvalidArgument
(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]"
,
in1_height
,
in2_dims
[
0
]));
PADDLE_ENFORCE_EQ
(
in1_height
,
out_dims
[
0
],
platform
::
errors
::
InvalidArgument
(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]"
,
in1_height
,
out_dims
[
0
]));
auto
&
in1_value
=
input1
.
value
();
auto
&
in1_rows
=
input1
.
rows
();
int64_t
in1_row_numel
=
in1_value
.
numel
()
/
in1_rows
.
size
();
PADDLE_ENFORCE_EQ
(
in1_row_numel
,
input2
.
numel
()
/
in1_height
,
platform
::
errors
::
InvalidArgument
(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]"
,
in1_row_numel
,
input2
.
numel
()
/
in1_height
));
PADDLE_ENFORCE_EQ
(
in1_row_numel
,
output
->
numel
()
/
in1_height
,
platform
::
errors
::
InvalidArgument
(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]"
,
in1_row_numel
,
output
->
numel
()
/
in1_height
));
auto
*
in1_data
=
in1_value
.
data
<
T
>
();
auto
*
in2_data
=
input2
.
data
<
T
>
();
auto
*
out_data
=
output
->
data
<
T
>
();
phi
::
funcs
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
static_cast
<
T
>
(
0
));
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
dim3
grid
(
in1_rows
.
size
(),
1
);
paddle
::
framework
::
MixVector
<
int64_t
>
mixv_in1_rows
(
&
in1_rows
);
SelectedRowsAddTensorKernel
<
T
,
block_size
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
mixv_in1_rows
.
CUDAData
(
context
.
GetPlace
()),
out_data
,
in1_row_numel
);
auto
out_eigen
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
output
);
auto
in2_eigen
=
framework
::
EigenVector
<
T
>::
Flatten
(
input2
);
out_eigen
.
device
(
*
context
.
eigen_device
())
=
out_eigen
+
in2_eigen
;
}
};
template
<
typename
T
>
struct
SelectedRowsAddTensor
<
phi
::
GPUContext
,
T
>
{
void
operator
()(
const
phi
::
GPUContext
&
context
,
...
...
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SelectedRowsAddTensor
<
phi
::
GPUContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
phi
::
GPUContext
,
double
>;
template
struct
SelectedRowsAdd
<
phi
::
GPUContext
,
platform
::
float16
>;
...
...
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
}
// namespace
template
<
typename
T
>
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
phi
::
SelectedRows
&
input1
,
framework
::
Tensor
*
input2
)
{
auto
in1_height
=
input1
.
height
();
auto
in2_dims
=
input2
->
dims
();
PADDLE_ENFORCE_EQ
(
in1_height
,
in2_dims
[
0
],
platform
::
errors
::
InvalidArgument
(
"The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]"
,
in1_height
,
in2_dims
[
0
]));
auto
&
in1_value
=
input1
.
value
();
auto
&
in1_rows
=
input1
.
rows
();
int64_t
in1_row_numel
=
in1_value
.
numel
()
/
in1_rows
.
size
();
PADDLE_ENFORCE_EQ
(
in1_row_numel
,
input2
->
numel
()
/
in1_height
,
platform
::
errors
::
InvalidArgument
(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]"
,
in1_row_numel
,
input2
->
numel
()
/
in1_height
));
auto
*
in1_data
=
in1_value
.
data
<
T
>
();
auto
*
in2_data
=
input2
->
data
<
T
>
();
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
dim3
grid
(
in1_rows
.
size
(),
1
);
paddle
::
framework
::
MixVector
<
int64_t
>
mixv_in1_rows
(
&
in1_rows
);
SelectedRowsAddToTensorKernel
<
T
,
block_size
>
<<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
in1_data
,
mixv_in1_rows
.
CUDAData
(
context
.
GetPlace
()),
in2_data
,
in1_row_numel
);
}
};
template
<
typename
T
>
struct
SelectedRowsAddToTensor
<
phi
::
GPUContext
,
T
>
{
void
operator
()(
const
phi
::
GPUContext
&
context
,
...
...
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
struct
SelectedRowsAddToTensor
<
phi
::
GPUContext
,
float
>;
template
struct
SelectedRowsAddToTensor
<
phi
::
GPUContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
phi
::
GPUContext
,
int
>;
...
...
@@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};
template
<
typename
T
>
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
T
>
{
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi
::
SelectedRows
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
phi
::
SelectedRows
&
input
,
const
bool
sorted_result
)
{
return
MergeAddImpl
<
platform
::
CUDADeviceContext
,
T
>
()(
context
,
input
,
sorted_result
);
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
phi
::
SelectedRows
&
input
,
phi
::
SelectedRows
*
output
,
const
bool
sorted_result
)
{
MergeAddImpl
<
platform
::
CUDADeviceContext
,
T
>
()(
context
,
input
,
output
,
sorted_result
);
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
std
::
vector
<
const
phi
::
SelectedRows
*>&
inputs
,
phi
::
SelectedRows
*
output
,
const
bool
sorted_result
)
{
MergeAddImpl
<
platform
::
CUDADeviceContext
,
T
>
()(
context
,
inputs
,
output
,
sorted_result
);
}
};
template
<
typename
T
>
struct
MergeAdd
<
phi
::
GPUContext
,
T
>
{
// unary functor, merge by adding duplicated rows in
...
...
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD
(
float
)
...
...
paddle/fluid/operators/math/sequence_padding.cu
浏览文件 @
88490567
...
...
@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
}
}
template
<
typename
T
>
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
LoDTensor
&
seq_tensor
,
framework
::
LoDTensor
*
pad_tensor
,
const
framework
::
LoDTensor
&
pad_value
,
int
pad_seq_len
=
-
1
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
const
PadLayout
layout
=
kBatchLengthWidth
)
{
auto
seq_lod
=
seq_tensor
.
lod
();
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_lod
)[
lod_level
];
const
auto
&
seq_tensor_dims
=
seq_tensor
.
dims
();
const
auto
&
pad_tensor_dims
=
pad_tensor
->
dims
();
int
max_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
if
(
pad_seq_len
==
-
1
)
{
pad_seq_len
=
max_seq_len
;
}
PADDLE_ENFORCE_GE
(
pad_seq_len
,
max_seq_len
,
platform
::
errors
::
InvalidArgument
(
"The pad_seq_len must be equal to or greater than the "
"original max sequence length. Expected %ld >= %ld, but got %ld < "
"%ld. Please check the input value."
,
pad_seq_len
,
max_seq_len
,
pad_seq_len
,
max_seq_len
));
int
step_width
=
seq_tensor
.
numel
()
/
seq_tensor_dims
[
0
];
int
seq_num
=
seq_offsets
.
size
()
-
1
;
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
step_width
,
layout
);
PADDLE_ENFORCE_EQ
(
pad_value
.
numel
()
==
1
||
pad_value
.
numel
()
==
step_width
,
true
,
platform
::
errors
::
InvalidArgument
(
"The numel of 'pad_value' can only be 1 or be equal to "
"the 'step_width', but got %ld != 1 and %ld. Please check the "
"input value."
,
pad_value
.
numel
(),
step_width
));
const
int
kBlockSize
=
512
;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t
block_dim_x
=
std
::
min
(((((
step_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
dim3
threads
(
block_dim_x
,
block_dim_y
);
size_t
grid_dim_x
=
(
pad_seq_len
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_y
=
seq_num
;
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
const
T
*
seq_data
=
seq_tensor
.
data
<
T
>
();
T
*
pad_data
=
pad_tensor
->
data
<
T
>
();
const
T
*
pad_value_data
=
pad_value
.
data
<
T
>
();
paddle
::
framework
::
MixVector
<
size_t
>
mix_vector_seq_offsets
(
&
seq_offsets
);
SequencePaddingKernel
<
T
,
kSeqToPad
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
pad_data
,
seq_data
,
pad_value_data
,
pad_value
.
numel
()
==
1
,
mix_vector_seq_offsets
.
CUDAData
(
context
.
GetPlace
()),
seq_num
,
pad_seq_len
,
step_width
,
norm_by_times
,
layout
);
}
};
template
<
typename
T
>
class
PaddingLoDTensorFunctor
<
phi
::
GPUContext
,
T
>
{
public:
...
...
@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template
<
typename
T
>
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
LoDTensor
&
pad_tensor
,
framework
::
LoDTensor
*
seq_tensor
,
int
pad_seq_len
=
-
1
,
int
lod_level
=
0
,
bool
norm_by_times
=
false
,
const
PadLayout
layout
=
kBatchLengthWidth
)
{
auto
seq_offsets
=
framework
::
ToAbsOffset
(
seq_tensor
->
lod
())[
lod_level
];
const
auto
&
seq_tensor_dims
=
seq_tensor
->
dims
();
const
auto
&
pad_tensor_dims
=
pad_tensor
.
dims
();
int
max_seq_len
=
MaximumSequenceLength
(
seq_offsets
);
if
(
pad_seq_len
==
-
1
)
{
pad_seq_len
=
max_seq_len
;
}
int
step_width
=
seq_tensor
->
numel
()
/
seq_tensor_dims
[
0
];
int
seq_num
=
seq_offsets
.
size
()
-
1
;
CheckDims
(
seq_tensor_dims
,
pad_tensor_dims
,
seq_offsets
,
pad_seq_len
,
step_width
,
layout
);
/*
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
seq_tensor);
seq_tensor->Resize(seq_tensor_dims);
return;
}
*/
const
int
kBlockSize
=
512
;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t
block_dim_x
=
std
::
min
(((((
step_width
+
7
)
>>
3
)
+
31
)
>>
5
)
<<
5
,
kBlockSize
);
size_t
block_dim_y
=
kBlockSize
/
block_dim_x
;
dim3
threads
(
block_dim_x
,
block_dim_y
);
size_t
grid_dim_x
=
(
pad_seq_len
+
block_dim_y
-
1
)
/
block_dim_y
;
size_t
grid_dim_y
=
seq_num
;
dim3
grid
(
grid_dim_x
,
grid_dim_y
);
const
T
*
pad_data
=
pad_tensor
.
data
<
T
>
();
T
*
seq_data
=
seq_tensor
->
data
<
T
>
();
paddle
::
framework
::
MixVector
<
size_t
>
mixv_seq_offsets
(
&
seq_offsets
);
SequencePaddingKernel
<
T
,
kPadToSeq
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
seq_data
,
pad_data
,
nullptr
,
false
,
mixv_seq_offsets
.
CUDAData
(
context
.
GetPlace
()),
seq_num
,
pad_seq_len
,
step_width
,
norm_by_times
,
layout
);
}
};
template
<
typename
T
>
class
UnpaddingLoDTensorFunctor
<
phi
::
GPUContext
,
T
>
{
public:
...
...
@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
}
};
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
PaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
UnpaddingLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
PaddingLoDTensorFunctor
<
phi
::
GPUContext
,
int
>;
template
class
PaddingLoDTensorFunctor
<
phi
::
GPUContext
,
int64_t
>;
template
class
PaddingLoDTensorFunctor
<
phi
::
GPUContext
,
float
>;
...
...
paddle/fluid/operators/math/sequence_scale.cu
浏览文件 @
88490567
...
...
@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
}
}
template
<
typename
T
>
class
ScaleLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
T
*
scales
,
framework
::
LoDTensor
*
seq
)
{
const
size_t
level
=
0
;
auto
lod
=
seq
->
lod
();
const
size_t
num_seq
=
lod
[
level
].
size
()
-
1
;
const
size_t
seq_width
=
seq
->
numel
()
/
seq
->
dims
()[
0
];
auto
abs_offset_lod
=
framework
::
ToAbsOffset
(
lod
);
T
*
seq_data
=
seq
->
mutable_data
<
T
>
(
context
.
GetPlace
());
paddle
::
framework
::
MixVector
<
size_t
>
mix_vector
(
&
(
abs_offset_lod
[
level
]));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
HIP_KERNEL_NAME
(
SequenceScaleKernel
<
T
,
PADDLE_CUDA_NUM_THREADS
>
),
dim3
(
num_seq
),
dim3
(
PADDLE_CUDA_NUM_THREADS
),
0
,
context
.
stream
(),
seq_data
,
mix_vector
.
CUDAMutableData
(
context
.
GetPlace
()),
scales
,
seq_width
);
#else
SequenceScaleKernel
<
T
,
PADDLE_CUDA_NUM_THREADS
>
<<<
num_seq
,
PADDLE_CUDA_NUM_THREADS
,
0
,
context
.
stream
()
>>>
(
seq_data
,
mix_vector
.
CUDAMutableData
(
context
.
GetPlace
()),
scales
,
seq_width
);
#endif
mix_vector
.
CopyToCPU
();
}
};
template
<
typename
T
>
class
ScaleLoDTensorFunctor
<
phi
::
GPUContext
,
T
>
{
public:
...
...
@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
}
};
template
class
ScaleLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
ScaleLoDTensorFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
ScaleLoDTensorFunctor
<
phi
::
GPUContext
,
float
>;
template
class
ScaleLoDTensorFunctor
<
phi
::
GPUContext
,
double
>;
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
88490567
...
...
@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
#endif
}
template
class
SoftmaxCUDNNFunctor
<
float
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxCUDNNFunctor
<
float
,
phi
::
GPUContext
>;
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
,
phi
::
GPUContext
>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template
class
SoftmaxCUDNNFunctor
<
platform
::
bfloat16
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
bfloat16
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxCUDNNFunctor
<
platform
::
bfloat16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
bfloat16
,
phi
::
GPUContext
>;
#endif
// MIOPEN do not support double
#ifndef PADDLE_WITH_HIP
template
class
SoftmaxCUDNNFunctor
<
double
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
,
platform
::
CUDADeviceContext
>;
template
class
SoftmaxCUDNNFunctor
<
double
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
,
phi
::
GPUContext
>;
#endif
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
bfloat16
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
bfloat16
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
,
false
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
,
true
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
,
true
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
bfloat16
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
platform
::
float16
,
false
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
platform
::
float16
,
true
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
platform
::
bfloat16
,
false
>;
...
...
paddle/fluid/operators/math/vol2col.cu
浏览文件 @
88490567
...
...
@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
}
// };
template
class
Vol2ColFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
Vol2ColFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
Vol2ColFunctor
<
phi
::
GPUContext
,
float
>;
template
class
Vol2ColFunctor
<
phi
::
GPUContext
,
double
>;
template
class
Col2VolFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
Col2VolFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
Col2VolFunctor
<
phi
::
GPUContext
,
float
>;
template
class
Col2VolFunctor
<
phi
::
GPUContext
,
double
>;
...
...
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
浏览文件 @
88490567
...
...
@@ -16,12 +16,6 @@
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
platform
{
class
CUDADeviceContext
;
}
// namespace platform
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
sequence_concat
,
paddle
::
operators
::
SeqConcatKernel
<
paddle
::
platform
::
CUDADeviceContext
,
...
...
paddle/fluid/platform/collective_helper.h
浏览文件 @
88490567
...
...
@@ -51,7 +51,6 @@ namespace platform {
//
// The NCCLComm instance is created and reversed in the NCCLCommContext
// singleton with a global user specified group id.
class
CUDADeviceContext
;
class
NCCLComm
{
public:
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
88490567
...
...
@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_
=
memory
::
Alloc
(
device_context_
,
required_workspace_bytes
);
}
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
phi
::
GPUContext
(
place
)
{}
CUDADeviceContext
::~
CUDADeviceContext
()
=
default
;
CUDAPinnedDeviceContext
::
CUDAPinnedDeviceContext
()
{
eigen_device_
.
reset
(
new
Eigen
::
DefaultDevice
());
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
88490567
...
...
@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
class
CudnnWorkspaceHandle
;
class
EigenCudaStreamDevice
;
class
CUDADeviceContext
:
public
phi
::
GPUContext
{
public:
explicit
CUDADeviceContext
(
CUDAPlace
place
);
virtual
~
CUDADeviceContext
();
private:
int
place_holder_
;
// TO BE REMOVED
DISABLE_COPY_AND_ASSIGN
(
CUDADeviceContext
);
};
using
CUDADeviceContext
=
phi
::
GPUContext
;
class
CudnnWorkspaceHandle
{
public:
...
...
paddle/fluid/platform/transform.h
浏览文件 @
88490567
...
...
@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
};
#if defined(__NVCC__) || defined(__HIPCC__)
template
<
>
struct
Transform
<
platform
::
CUDADeviceContext
>
{
template
<
typename
InputIter
,
typename
OutputIter
,
typename
UnaryOperation
>
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
InputIter
first
,
InputIter
last
,
OutputIter
result
,
UnaryOperation
op
)
{
auto
place
=
context
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
is_gpu_place
(
place
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"The CUDA Transform must be used in GPU place."
));
#ifdef __HIPCC__
thrust
::
transform
(
thrust
::
hip
::
par
.
on
(
context
.
stream
()),
details
::
CastToCUDATransformIterator
(
first
),
details
::
CastToCUDATransformIterator
(
last
),
details
::
CastToCUDATransformIterator
(
result
),
op
);
#else
thrust
::
transform
(
thrust
::
cuda
::
par
.
on
(
context
.
stream
()),
details
::
CastToCUDATransformIterator
(
first
),
details
::
CastToCUDATransformIterator
(
last
),
details
::
CastToCUDATransformIterator
(
result
),
op
);
#endif
}
template
<
typename
InputIter1
,
typename
InputIter2
,
typename
OutputIter
,
typename
BinaryOperation
>
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
InputIter1
first1
,
InputIter1
last1
,
InputIter2
first2
,
OutputIter
result
,
BinaryOperation
op
)
{
auto
place
=
context
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
is_gpu_place
(
place
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"The CUDA Transform must be used in GPU place."
));
#ifdef __HIPCC__
thrust
::
transform
(
thrust
::
hip
::
par
.
on
(
context
.
stream
()),
details
::
CastToCUDATransformIterator
(
first1
),
details
::
CastToCUDATransformIterator
(
last1
),
details
::
CastToCUDATransformIterator
(
first2
),
details
::
CastToCUDATransformIterator
(
result
),
op
);
#else
thrust
::
transform
(
thrust
::
cuda
::
par
.
on
(
context
.
stream
()),
details
::
CastToCUDATransformIterator
(
first1
),
details
::
CastToCUDATransformIterator
(
last1
),
details
::
CastToCUDATransformIterator
(
first2
),
details
::
CastToCUDATransformIterator
(
result
),
op
);
#endif
}
};
template
<
>
struct
Transform
<
phi
::
GPUContext
>
{
...
...
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
浏览文件 @
88490567
此差异已折叠。
点击以展开。
paddle/phi/kernels/funcs/blas/blas_impl.hip.h
浏览文件 @
88490567
此差异已折叠。
点击以展开。
paddle/phi/kernels/funcs/fc_functor.cu
浏览文件 @
88490567
...
...
@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
AddReluKernel
(
context
.
stream
(),
M
,
N
,
Y
,
B
,
relu
);
}
// Explicit instantiations of FCFunctor with
// paddle::platform::CUDADeviceContext for float16, float and double.
template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
template class FCFunctor<paddle::platform::CUDADeviceContext, double>;

// Explicit instantiations of FCFunctor with GPUContext for the same three
// element types.
template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;
...
...
paddle/phi/kernels/funcs/for_range.h
浏览文件 @
88490567
...
...
@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
size_t
limit_
;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template
<
>
struct
ForRange
<
paddle
::
platform
::
CUDADeviceContext
>
{
ForRange
(
const
paddle
::
platform
::
CUDADeviceContext
&
dev_ctx
,
size_t
limit
)
:
dev_ctx_
(
dev_ctx
),
limit_
(
limit
)
{}
template
<
typename
Function
>
inline
void
operator
()(
Function
func
)
const
{
phi
::
funcs
::
ForRange
<
phi
::
GPUContext
>
for_range
(
dev_ctx_
,
limit_
);
for_range
(
func
);
}
const
paddle
::
platform
::
CUDADeviceContext
&
dev_ctx_
;
size_t
limit_
;
};
#endif
}
// namespace funcs
...
...
paddle/phi/kernels/funcs/math_function.cu
浏览文件 @
88490567
...
...
@@ -31,22 +31,6 @@ namespace funcs {
// Local aliases for phi::dtype::float16 and phi::dtype::bfloat16.
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;

// Explicit instantiations of SetConstant with
// paddle::platform::CUDADeviceContext.
template struct SetConstant<paddle::platform::CUDADeviceContext,
                            phi::dtype::float16>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
                            phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
                            phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CUDADeviceContext,
                            phi::dtype::complex<double>>;

// Explicit instantiations of SetConstant with phi::GPUContext.
template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::GPUContext, float>;
...
...
@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
// Explicit instantiation of SetConstant for pinned-memory contexts holding
// double-precision complex values.
template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                            phi::dtype::complex<double>>;
// DEFINE_GPU_TRANS(RANK) expands to a list of explicit instantiations of
// Transpose<Context, T, RANK>, one per element type listed in the macro body.
//
// NOTE(review): two `#define DEFINE_GPU_TRANS(RANK)` lines appear below, and
// the line just before the second one ends in a continuation backslash, so as
// written the second `#define` is spliced into the first macro's replacement
// list. This looks like an older body (CUDADeviceContext + GPUContext) and a
// newer body (GPUContext only) shown together -- confirm which one is
// intended and keep only that definition.
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
double, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
float16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int8_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int32_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CUDADeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<phi::GPUContext, bool, RANK>; \
template struct Transpose<phi::GPUContext, float, RANK>; \
template struct Transpose<phi::GPUContext, double, RANK>; \
template struct Transpose<phi::GPUContext, float16, RANK>; \
template struct Transpose<phi::GPUContext, bfloat16, RANK>; \
template struct Transpose<phi::GPUContext, int8_t, RANK>; \
template struct Transpose<phi::GPUContext, int32_t, RANK>; \
template struct Transpose<phi::GPUContext, int64_t, RANK>; \
template struct Transpose<phi::GPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
// Instantiate the Transpose specializations for rank 1.
DEFINE_GPU_TRANS
(
1
);
...
...
@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
};
// Define transpose normal: DEFINE_GPU_TRANS_NORMAL(TYPE) expands to an
// explicit instantiation of TransposeNormal<Context, TYPE>.
//
// NOTE(review): two `#define DEFINE_GPU_TRANS_NORMAL(TYPE)` lines appear
// below, and the line before the second one ends in a continuation backslash,
// so as written the second `#define` is spliced into the first macro's
// replacement list. This looks like an older (CUDADeviceContext) and a newer
// (phi::GPUContext) body shown together -- confirm which one is intended.
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::GPUContext, TYPE>
// Instantiate TransposeNormal for float16.
DEFINE_GPU_TRANS_NORMAL
(
float16
);
...
...
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
浏览文件 @
88490567
...
...
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
// Explicit instantiations of MatrixInverseFunctor with GPUContext for float
// and double.
template class MatrixInverseFunctor<GPUContext, float>;
template class MatrixInverseFunctor<GPUContext, double>;

// TODO(chenweihang): remove these instantiations later
// Same element types, instantiated with paddle::platform::CUDADeviceContext.
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
                                    float>;
template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
                                    double>;
}
// namespace funcs
}
// namespace phi
paddle/phi/kernels/funcs/matrix_solve.cu
浏览文件 @
88490567
...
...
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
// Explicit instantiations of MatrixSolveFunctor with GPUContext for float
// and double.
template class MatrixSolveFunctor<GPUContext, float>;
template class MatrixSolveFunctor<GPUContext, double>;

// TODO(wuweilong): remove these instantiations later
// Same element types, instantiated with paddle::platform::CUDADeviceContext.
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
}
// namespace funcs
}
// namespace phi
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录