PaddlePaddle / Paddle — commit 88490567 (unverified)
Authored by Leo Chen; committed via GitHub on Jul 29, 2022
unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove cudaDeviceContext
* remove more template
* fix rocm compile
Parent: 0a2db7c8
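For orientation before the per-file hunks: the heart of this commit is in paddle/fluid/platform/device_context.h, where the CUDADeviceContext class is replaced by a type alias for phi::GPUContext. Below is a minimal standalone sketch (toy types, not Paddle code) of why that single change lets so many duplicated template instantiations elsewhere in the diff be deleted: once the two names denote one type, keeping both explicit instantiations would be a duplicate-definition error.

#include <iostream>

namespace phi {
struct GPUContext {
  void Run() const { std::cout << "phi::GPUContext\n"; }
};
}  // namespace phi

namespace platform {
// The core of commit 88490567 (paddle/fluid/platform/device_context.h):
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform

template <typename DeviceContext>
struct Functor {
  void operator()(const DeviceContext& ctx) const { ctx.Run(); }
};

// One explicit instantiation now serves both spellings; adding
// "template struct Functor<platform::CUDADeviceContext>;" as well would
// instantiate the same specialization twice, which is ill-formed. That is
// why this commit deletes the CUDADeviceContext copies throughout.
template struct Functor<phi::GPUContext>;

int main() {
  platform::CUDADeviceContext ctx;  // same type as phi::GPUContext
  Functor<platform::CUDADeviceContext>{}(ctx);
}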
Showing 26 changed files with 122 additions and 2,801 deletions.
paddle/fluid/framework/details/eager_deletion_op_handle.h (+0 −6)
paddle/fluid/memory/allocation/cuda_device_context_allocator.h (+0 −5)
paddle/fluid/operators/cudnn_lstm_op.cu.cc (+0 −7)
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu (+4 −3)
paddle/fluid/operators/gru_op.cu.cc (+0 −7)
paddle/fluid/operators/math/cross_entropy.cu (+0 −5)
paddle/fluid/operators/math/im2col.cu (+0 −24)
paddle/fluid/operators/math/maxouting.cu (+0 −6)
paddle/fluid/operators/math/sample_prob.h (+0 −6)
paddle/fluid/operators/math/selected_rows_functor.cu (+2 −159)
paddle/fluid/operators/math/sequence_padding.cu (+0 −159)
paddle/fluid/operators/math/sequence_scale.cu (+0 −40)
paddle/fluid/operators/math/softmax.cu (+0 −35)
paddle/fluid/operators/math/vol2col.cu (+0 −4)
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc (+0 −6)
paddle/fluid/platform/collective_helper.h (+0 −1)
paddle/fluid/platform/device_context.cc (+0 −5)
paddle/fluid/platform/device_context.h (+1 −9)
paddle/fluid/platform/transform.h (+0 −60)
paddle/phi/kernels/funcs/blas/blas_impl.cu.h (+80 −1263)
paddle/phi/kernels/funcs/blas/blas_impl.hip.h (+22 −906)
paddle/phi/kernels/funcs/fc_functor.cu (+0 −4)
paddle/phi/kernels/funcs/for_range.h (+0 −16)
paddle/phi/kernels/funcs/math_function.cu (+13 −56)
paddle/phi/kernels/funcs/matrix_inverse.cu.cc (+0 −5)
paddle/phi/kernels/funcs/matrix_solve.cu (+0 −4)
paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace framework {
 class GarbageCollector;
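The same pattern repeats in several headers below: a bare forward declaration "class CUDADeviceContext;" is deleted. A likely reason, inferred from the device_context.h hunk later in this diff rather than stated in the commit, is that once CUDADeviceContext becomes a type alias, a class forward declaration of the same name would conflict with the alias. A toy sketch, not Paddle code:

namespace phi {
class GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;
// class CUDADeviceContext;  // would not compile: the name already denotes
//                           // a type alias, not a class it could redeclare
}  // namespace platform

int main() {
  platform::CUDADeviceContext ctx;  // the old spelling still works
  (void)ctx;
}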
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
 namespace memory {
 namespace allocation {
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                              config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
   // not need show click input
   N = static_cast<size_t>(batch_size * slot_num *
                           (embedding_size - cvm_offset));
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config =
+      platform::GetGpuLaunchConfig1D(dev_ctx, N);
   FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                         config.thread_per_block.x,
                         0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
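A plausible reason these call sites gained an explicit platform:: qualifier (an inference from the diff, not stated in the commit): argument-dependent lookup. While dev_ctx was a platform::CUDADeviceContext class, an unqualified GetGpuLaunchConfig1D(dev_ctx, N) was found in namespace platform via ADL; once CUDADeviceContext is an alias for phi::GPUContext, the argument's associated namespace is phi, so the call must be qualified. A toy sketch with simplified names:

#include <cstddef>
#include <cstdio>

namespace phi {
struct GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;  // after the unification

struct GpuLaunchConfig {
  int threads = 256;
};

inline GpuLaunchConfig GetGpuLaunchConfig1D(const CUDADeviceContext&,
                                            std::size_t n) {
  GpuLaunchConfig cfg;
  cfg.threads = n < 256 ? static_cast<int>(n) : 256;
  return cfg;
}
}  // namespace platform

int main() {
  platform::CUDADeviceContext dev_ctx;
  // GetGpuLaunchConfig1D(dev_ctx, 1024);  // error: ADL now searches phi,
  //                                       // not platform
  auto cfg = platform::GetGpuLaunchConfig1D(dev_ctx, 1024);  // must qualify
  std::printf("threads = %d\n", cfg.threads);
}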
paddle/fluid/operators/gru_op.cu.cc
@@ -14,13 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
paddle/fluid/operators/math/cross_entropy.cu
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
   }
 }
 
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
-
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
paddle/fluid/operators/math/im2col.cu
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
@@ -576,12 +564,6 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
@@ -589,12 +571,6 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
paddle/fluid/operators/math/maxouting.cu
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
       axis);
 }
 
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/sample_prob.h
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 namespace math {
paddle/fluid/operators/math/selected_rows_functor.cu
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
   }
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height,
-            in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height,
-            out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel,
-            output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            out_data,
-            in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext,
-                                platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
   }
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height,
-                                          in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            in2_data,
-            in1_row_numel);
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
   }
 };
 
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, output, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, inputs, output, sorted_result);
-  }
-};
-
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
   // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
   }
 };
 
 #define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                    \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
   template struct MergeAddImpl<phi::GPUContext, dtype>;             \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;     \
   template struct MergeAdd<phi::GPUContext, dtype>;
 
 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
paddle/fluid/operators/math/sequence_padding.cu
@@ -57,88 +57,6 @@ __global__ void SequencePaddingKernel(T* dst,
   }
 }
 
-template <typename T>
-class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq_tensor,
-                  framework::LoDTensor* pad_tensor,
-                  const framework::LoDTensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_lod = seq_tensor.lod();
-    auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
-    const auto& seq_tensor_dims = seq_tensor.dims();
-    const auto& pad_tensor_dims = pad_tensor->dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    PADDLE_ENFORCE_GE(
-        pad_seq_len,
-        max_seq_len,
-        platform::errors::InvalidArgument(
-            "The pad_seq_len must be equal to or greater than the "
-            "original max sequence length. Expected %ld >= %ld, but got %ld < "
-            "%ld. Please check the input value.",
-            pad_seq_len,
-            max_seq_len,
-            pad_seq_len,
-            max_seq_len));
-    int step_width = seq_tensor.numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    PADDLE_ENFORCE_EQ(
-        pad_value.numel() == 1 || pad_value.numel() == step_width,
-        true,
-        platform::errors::InvalidArgument(
-            "The numel of 'pad_value' can only be 1 or be equal to "
-            "the 'step_width', but got %ld != 1 and %ld. Please check the "
-            "input value.",
-            pad_value.numel(),
-            step_width));
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* seq_data = seq_tensor.data<T>();
-    T* pad_data = pad_tensor->data<T>();
-    const T* pad_value_data = pad_value.data<T>();
-
-    paddle::framework::MixVector<size_t> mix_vector_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
-        pad_data,
-        seq_data,
-        pad_value_data,
-        pad_value.numel() == 1,
-        mix_vector_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class PaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -221,73 +139,6 @@ class PaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template <typename T>
-class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& pad_tensor,
-                  framework::LoDTensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth) {
-    auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
-    const auto& seq_tensor_dims = seq_tensor->dims();
-    const auto& pad_tensor_dims = pad_tensor.dims();
-    int max_seq_len = MaximumSequenceLength(seq_offsets);
-    if (pad_seq_len == -1) {
-      pad_seq_len = max_seq_len;
-    }
-    int step_width = seq_tensor->numel() / seq_tensor_dims[0];
-    int seq_num = seq_offsets.size() - 1;
-
-    CheckDims(seq_tensor_dims,
-              pad_tensor_dims,
-              seq_offsets,
-              pad_seq_len,
-              step_width,
-              layout);
-    /*
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context,
-    seq_tensor);
-      seq_tensor->Resize(seq_tensor_dims);
-      return;
-    }
-    */
-
-    const int kBlockSize = 512;
-
-    /* At least use 32 threads to copy sequence_width elements,
-     * and at least 8 elements for each thread.
-     */
-    size_t block_dim_x =
-        std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
-    size_t block_dim_y = kBlockSize / block_dim_x;
-    dim3 threads(block_dim_x, block_dim_y);
-
-    size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y;
-    size_t grid_dim_y = seq_num;
-    dim3 grid(grid_dim_x, grid_dim_y);
-
-    const T* pad_data = pad_tensor.data<T>();
-    T* seq_data = seq_tensor->data<T>();
-
-    paddle::framework::MixVector<size_t> mixv_seq_offsets(&seq_offsets);
-    SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
-        seq_data,
-        pad_data,
-        nullptr,
-        false,
-        mixv_seq_offsets.CUDAData(context.GetPlace()),
-        seq_num,
-        pad_seq_len,
-        step_width,
-        norm_by_times,
-        layout);
-  }
-};
-
 template <typename T>
 class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -355,16 +206,6 @@ class UnpaddingLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, int64_t>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class PaddingLoDTensorFunctor<phi::GPUContext, int>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, int64_t>;
 template class PaddingLoDTensorFunctor<phi::GPUContext, float>;
paddle/fluid/operators/math/sequence_scale.cu
@@ -35,43 +35,6 @@ __global__ void SequenceScaleKernel(T* seq,
   }
 }
 
-template <typename T>
-class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const T* scales,
-                  framework::LoDTensor* seq) {
-    const size_t level = 0;
-    auto lod = seq->lod();
-    const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq->numel() / seq->dims()[0];
-    auto abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq->mutable_data<T>(context.GetPlace());
-    paddle::framework::MixVector<size_t> mix_vector(&(abs_offset_lod[level]));
-
-#ifdef PADDLE_WITH_HIP
-    hipLaunchKernelGGL(
-        HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
-        dim3(num_seq),
-        dim3(PADDLE_CUDA_NUM_THREADS),
-        0,
-        context.stream(),
-        seq_data,
-        mix_vector.CUDAMutableData(context.GetPlace()),
-        scales,
-        seq_width);
-#else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
-        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-            seq_data,
-            mix_vector.CUDAMutableData(context.GetPlace()),
-            scales,
-            seq_width);
-#endif
-    mix_vector.CopyToCPU();
-  }
-};
-
 template <typename T>
 class ScaleLoDTensorFunctor<phi::GPUContext, T> {
  public:
@@ -109,9 +72,6 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
   }
 };
 
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
-template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
-
 template class ScaleLoDTensorFunctor<phi::GPUContext, float>;
 template class ScaleLoDTensorFunctor<phi::GPUContext, double>;
paddle/fluid/operators/math/softmax.cu
@@ -141,56 +141,21 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
 #endif
 }
 
-template class SoftmaxCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxCUDNNFunctor<platform::float16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<float, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::float16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
 #if CUDNN_VERSION_MIN(8, 1, 0)
-template class SoftmaxCUDNNFunctor<platform::bfloat16,
-                                   platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<platform::bfloat16,
-                                       platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
 #endif
 
 // MIOPEN do not support double
 #ifndef PADDLE_WITH_HIP
-template class SoftmaxCUDNNFunctor<double, platform::CUDADeviceContext>;
-template class SoftmaxGradCUDNNFunctor<double, platform::CUDADeviceContext>;
 template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
 #endif
 
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::float16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext,
-                              platform::bfloat16,
-                              true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::bfloat16>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, false>;
 template class SoftmaxFunctor<phi::GPUContext, platform::float16, true>;
 template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16, false>;
paddle/fluid/operators/math/vol2col.cu
@@ -417,13 +417,9 @@ void Col2VolFunctor<DeviceContext, T>::operator()(
 }
 // };
 
-template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
-template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
 template class Vol2ColFunctor<phi::GPUContext, float>;
 template class Vol2ColFunctor<phi::GPUContext, double>;
-template class Col2VolFunctor<platform::CUDADeviceContext, float>;
-template class Col2VolFunctor<platform::CUDADeviceContext, double>;
 template class Col2VolFunctor<phi::GPUContext, float>;
 template class Col2VolFunctor<phi::GPUContext, double>;
paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -16,12 +16,6 @@
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 REGISTER_OP_CUDA_KERNEL(
     sequence_concat,
     paddle::operators::SeqConcatKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/platform/collective_helper.h
@@ -51,7 +51,6 @@ namespace platform {
 //
 // The NCCLComm instance is created and reversed in the NCCLCommContext
 // singleton with a global user specified group id.
-class CUDADeviceContext;
 class NCCLComm {
  public:
paddle/fluid/platform/device_context.cc
@@ -533,11 +533,6 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
   allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }
 
-CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
-    : phi::GPUContext(place) {}
-
-CUDADeviceContext::~CUDADeviceContext() = default;
-
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
paddle/fluid/platform/device_context.h
@@ -271,15 +271,7 @@ struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;
 
-class CUDADeviceContext : public phi::GPUContext {
- public:
-  explicit CUDADeviceContext(CUDAPlace place);
-  virtual ~CUDADeviceContext();
-
- private:
-  int place_holder_;  // TO BE REMOVED
-  DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
-};
+using CUDADeviceContext = phi::GPUContext;
 
 class CudnnWorkspaceHandle {
  public:
paddle/fluid/platform/transform.h
@@ -96,66 +96,6 @@ struct Transform<phi::CPUContext> {
 };
 
 #if defined(__NVCC__) || defined(__HIPCC__)
-template <>
-struct Transform<platform::CUDADeviceContext> {
-  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter first,
-                  InputIter last,
-                  OutputIter result,
-                  UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first),
-                      details::CastToCUDATransformIterator(last),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-
-  template <typename InputIter1,
-            typename InputIter2,
-            typename OutputIter,
-            typename BinaryOperation>
-  void operator()(const platform::CUDADeviceContext& context,
-                  InputIter1 first1,
-                  InputIter1 last1,
-                  InputIter2 first2,
-                  OutputIter result,
-                  BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(is_gpu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "The CUDA Transform must be used in GPU place."));
-#ifdef __HIPCC__
-    thrust::transform(thrust::hip::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#else
-    thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::CastToCUDATransformIterator(first1),
-                      details::CastToCUDATransformIterator(last1),
-                      details::CastToCUDATransformIterator(first2),
-                      details::CastToCUDATransformIterator(result),
-                      op);
-#endif
-  }
-};
-
 template <>
 struct Transform<phi::GPUContext> {
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
(diff collapsed in this capture; not shown: +80 −1263)
paddle/phi/kernels/funcs/blas/blas_impl.hip.h
(diff collapsed in this capture; not shown: +22 −906)
paddle/phi/kernels/funcs/fc_functor.cu
@@ -313,10 +313,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
   AddReluKernel(context.stream(), M, N, Y, B, relu);
 }
 
-template class FCFunctor<paddle::platform::CUDADeviceContext, float16>;
-template class FCFunctor<paddle::platform::CUDADeviceContext, float>;
-template class FCFunctor<paddle::platform::CUDADeviceContext, double>;
-
 template class FCFunctor<GPUContext, float16>;
 template class FCFunctor<GPUContext, float>;
 template class FCFunctor<GPUContext, double>;
paddle/phi/kernels/funcs/for_range.h
@@ -91,22 +91,6 @@ struct ForRange<phi::GPUContext> {
   size_t limit_;
 };
 
-// NOTE: After the pten kernel is migrated, it needs to be deleted.
-template <>
-struct ForRange<paddle::platform::CUDADeviceContext> {
-  ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit)
-      : dev_ctx_(dev_ctx), limit_(limit) {}
-
-  template <typename Function>
-  inline void operator()(Function func) const {
-    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx_, limit_);
-    for_range(func);
-  }
-
-  const paddle::platform::CUDADeviceContext& dev_ctx_;
-  size_t limit_;
-};
-
 #endif
 
 }  // namespace funcs
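The deleted ForRange<paddle::platform::CUDADeviceContext> specialization only forwarded to ForRange<phi::GPUContext>, and with the alias in place it would name the very same specialization, so it has to go. Templates see through aliases: the phi::GPUContext specialization is selected even when callers spell the old name. A toy sketch with simplified types:

#include <cstddef>
#include <cstdio>

namespace phi {
struct GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform

template <typename Context>
struct ForRange;  // primary template intentionally undefined

template <>
struct ForRange<phi::GPUContext> {
  ForRange(const phi::GPUContext&, std::size_t limit) : limit_(limit) {}

  template <typename Function>
  void operator()(Function func) const {
    // Stand-in for launching a GPU kernel over [0, limit_).
    for (std::size_t i = 0; i < limit_; ++i) func(i);
  }

  std::size_t limit_;
};

int main() {
  platform::CUDADeviceContext ctx;
  // Resolves to ForRange<phi::GPUContext>: the alias is transparent here,
  // so no forwarding specialization is needed.
  ForRange<platform::CUDADeviceContext> for_range(ctx, 3);
  for_range([](std::size_t i) { std::printf("i = %zu\n", i); });
}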
paddle/phi/kernels/funcs/math_function.cu
@@ -31,22 +31,6 @@ namespace funcs {
 using float16 = phi::dtype::float16;
 using bfloat16 = phi::dtype::bfloat16;
 
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::float16>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::bfloat16>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, float>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, double>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, uint8_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int16_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, int64_t>;
-template struct SetConstant<paddle::platform::CUDADeviceContext, bool>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::complex<float>>;
-template struct SetConstant<paddle::platform::CUDADeviceContext,
-                            phi::dtype::complex<double>>;
-
 template struct SetConstant<phi::GPUContext, phi::dtype::float16>;
 template struct SetConstant<phi::GPUContext, phi::dtype::bfloat16>;
 template struct SetConstant<phi::GPUContext, float>;
@@ -75,44 +59,18 @@ template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
 template struct SetConstant<paddle::platform::CUDAPinnedDeviceContext,
                             phi::dtype::complex<double>>;
 
 #define DEFINE_GPU_TRANS(RANK)                                                \
-  template struct Transpose<paddle::platform::CUDADeviceContext, bool, RANK>; \
-  template struct Transpose<paddle::platform::CUDADeviceContext, float, RANK>; \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            double,                                          \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            float16,                                         \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            bfloat16,                                        \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            int8_t,                                          \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            int32_t,                                         \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            int64_t,                                         \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            phi::dtype::complex<float>,                      \
-                            RANK>;                                           \
-  template struct Transpose<paddle::platform::CUDADeviceContext,             \
-                            phi::dtype::complex<double>,                     \
-                            RANK>;                                           \
   template struct Transpose<phi::GPUContext, bool, RANK>;                    \
   template struct Transpose<phi::GPUContext, float, RANK>;                   \
   template struct Transpose<phi::GPUContext, double, RANK>;                  \
   template struct Transpose<phi::GPUContext, float16, RANK>;                 \
   template struct Transpose<phi::GPUContext, bfloat16, RANK>;                \
   template struct Transpose<phi::GPUContext, int8_t, RANK>;                  \
   template struct Transpose<phi::GPUContext, int32_t, RANK>;                 \
   template struct Transpose<phi::GPUContext, int64_t, RANK>;                 \
   template struct Transpose<phi::GPUContext,                                 \
                             phi::dtype::complex<float>,                      \
                             RANK>;                                           \
   template struct Transpose<phi::GPUContext, phi::dtype::complex<double>, RANK>;
 
 DEFINE_GPU_TRANS(1);
@@ -240,8 +198,7 @@ struct TransposeNormal<phi::GPUContext, T> {
 };
 
 // define transpose normal
 #define DEFINE_GPU_TRANS_NORMAL(TYPE)                                         \
-  template struct TransposeNormal<paddle::platform::CUDADeviceContext, TYPE>; \
   template struct TransposeNormal<phi::GPUContext, TYPE>
 
 DEFINE_GPU_TRANS_NORMAL(float16);
paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -131,10 +131,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
 template class MatrixInverseFunctor<GPUContext, float>;
 template class MatrixInverseFunctor<GPUContext, double>;
 
-// TODO(chenweihang): remove these instantiations later
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixInverseFunctor<paddle::platform::CUDADeviceContext,
-                                    double>;
-
 }  // namespace funcs
 }  // namespace phi
paddle/phi/kernels/funcs/matrix_solve.cu
@@ -170,9 +170,5 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
 template class MatrixSolveFunctor<GPUContext, float>;
 template class MatrixSolveFunctor<GPUContext, double>;
 
-// TODO(wuweilong): remove these instantiations later
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, float>;
-template class MatrixSolveFunctor<paddle::platform::CUDADeviceContext, double>;
-
 }  // namespace funcs
 }  // namespace phi