BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 580447d0 (unverified), authored Feb 25, 2021 by Qi Li, committed via GitHub on Feb 25, 2021
[ROCM] update fluid framework for rocm (part4), test=develop (#31013)
Parent: 7d91974c
Showing 19 changed files with 137 additions and 56 deletions (+137, -56)
paddle/fluid/framework/CMakeLists.txt  +31 -5
paddle/fluid/framework/array.h  +14 -4
paddle/fluid/framework/conv_search_cache.h  +27 -1
paddle/fluid/framework/copy_same_tensor_test.cc  +1 -1
paddle/fluid/framework/data_feed.cc  +5 -2
paddle/fluid/framework/data_feed.h  +1 -1
paddle/fluid/framework/data_feed_factory.cc  +1 -1
paddle/fluid/framework/data_type_transform.cc  +1 -1
paddle/fluid/framework/details/broadcast_op_handle.cc  +1 -1
paddle/fluid/framework/details/broadcast_op_handle.h  +4 -4
paddle/fluid/framework/details/broadcast_op_handle_test.cc  +2 -1
paddle/fluid/framework/details/broadcast_op_handle_test.h  +5 -5
paddle/fluid/framework/device_worker.h  +23 -20
paddle/fluid/framework/device_worker_factory.cc  +5 -3
paddle/fluid/framework/dim_test.cu  +10 -0
paddle/fluid/framework/dlpack_tensor.cc  +2 -2
paddle/fluid/framework/dlpack_tensor_test.cc  +1 -1
paddle/fluid/framework/executor.cc  +1 -1
paddle/fluid/framework/generator.cc  +2 -2
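Note: every diff below widens one of a few preprocessor guard families so that ROCm builds take the same code paths as CUDA builds. A hedged summary sketch (illustrative constant only, not Paddle code):

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr bool kDeviceBuild = true;   // build-option guard: NVIDIA or AMD
#else
constexpr bool kDeviceBuild = false;  // CPU-only build
#endif
// The same widening recurs for the collectives guard (PADDLE_WITH_NCCL
// gains PADDLE_WITH_RCCL) and for compiler macros (__NVCC__ and
// __CUDA_ARCH__ gain __HIPCC__).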
paddle/fluid/framework/CMakeLists.txt
@@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+if(WITH_GPU)
+  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+elseif(WITH_ROCM)
+  hip_test(dim_test SRCS dim_test.cu DEPS ddim)
+endif()
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
@@ -46,6 +50,8 @@ if(WITH_GPU)
   else()
     nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
   endif(WIN32)
+elseif(WITH_ROCM)
+  hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
 endif()
@@ -53,6 +59,8 @@ endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
+elseif(WITH_ROCM)
+  hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
 else()
   cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor)
 endif()
@@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+elseif(WITH_ROCM)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+if(WITH_GPU)
+  nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+elseif(WITH_ROCM)
+  hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+endif()
 cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
@@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function scope)
+if(WITH_GPU)
+  nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+          DEPS operator op_registry device_context math_function scope)
+elseif(WITH_ROCM)
+  hip_test(data_device_transform_test SRCS data_device_transform_test.cu
+           DEPS operator op_registry device_context math_function scope)
+endif()
 if(WITH_GPU)
   if(WIN32)
@@ -108,6 +128,9 @@ if(WITH_GPU)
     nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
   endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+elseif(WITH_ROCM)
+  hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
   cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
 cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
 cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
-nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+if(WITH_GPU)
+  nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+elseif(WITH_ROCM)
+  hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+endif()
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
paddle/fluid/framework/array.h
@@ -54,7 +54,7 @@ class Array {
   }

   HOSTDEVICE inline T &at(size_t i) {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
     PADDLE_ENFORCE_LT(
         i, N, platform::errors::OutOfRange("Array index out of bounds."));
 #endif
@@ -62,7 +62,7 @@ class Array {
   }

   HOSTDEVICE inline const T &at(size_t i) const {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
     PADDLE_ENFORCE_LT(
         i, N, platform::errors::OutOfRange("Array index out of bounds."));
 #endif
@@ -103,7 +103,12 @@ class Array<T, 0> {
   HOSTDEVICE inline T *GetMutable() { return nullptr; }

   HOSTDEVICE inline T &operator[](size_t) {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__)
+    // HIP will have compile error, if use "obj()"
+    // function declared in block scope cannot have 'static' storage class
+    static T obj{};
+    return obj;
+#elif defined(__CUDA_ARCH__)
     static T obj();
     return obj;
 #else
@@ -112,7 +117,12 @@ class Array<T, 0> {
   }

   HOSTDEVICE inline const T &operator[](size_t) const {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__)
+    // HIP will have compile error, if use "obj()"
+    // function declared in block scope cannot have 'static' storage class
+    static const T obj{};
+    return obj;
+#elif defined(__CUDA_ARCH__)
     static const T obj();
     return obj;
 #else
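The array.h hunks keep host-only bounds checks out of device compilation on both toolchains; note that __HIPCC__ is defined for hipcc's host pass as well, so on ROCm the check is skipped entirely. A minimal standalone sketch of the pattern, with TinyArray as a hypothetical stand-in for framework::Array and assert standing in for PADDLE_ENFORCE_LT:

#include <cassert>
#include <cstddef>

#ifndef HOSTDEVICE  // Paddle expands this to __host__ __device__ under nvcc/hipcc
#define HOSTDEVICE
#endif

template <typename T, size_t N>
struct TinyArray {
  HOSTDEVICE T& at(size_t i) {
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
    assert(i < N);  // host-side check only; device code cannot throw
#endif
    return data_[i];
  }
  T data_[N];
};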
paddle/fluid/framework/conv_search_cache.h
@@ -16,7 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator_kernel_configs.h"

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#else
 #include "paddle/fluid/platform/cudnn_helper.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -32,7 +37,20 @@ class ConvSearchCache {
     static ConvSearchCache instance;
     return instance;
   }
+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetForward() {
+    return &forward_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t>* GetBackwardData() {
+    return &backward_data_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t>* GetBackwardFilter() {
+    return &backward_filter_cache_;
+  }
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetConvFusion() {
+    return &fusion_forward_cache_;
+  }
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
     return &forward_cache_;
   }
@@ -45,6 +63,7 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
     return &fusion_forward_cache_;
   }
+#endif

  private:
   ConvSearchCache() {}
@@ -52,10 +71,17 @@ class ConvSearchCache {
   ConvSearchCache(const ConvSearchCache&) {}
   ConvSearchCache& operator=(const ConvSearchCache&) {}

+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> forward_cache_;
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t> backward_data_cache_;
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t> backward_filter_cache_;
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> fusion_forward_cache_;
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
   AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+#endif
 };

 }  // namespace framework
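The cache exposes identical method names on both stacks and swaps only the algorithm enum types. A compressed sketch of the same idea via a type alias; ConvFwdAlgo and DemoSearchCache are hypothetical names, and the commit instead keeps two explicit blocks, matching the file's existing style:

#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
using ConvFwdAlgo = miopenConvFwdAlgorithm_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cudnn.h>
using ConvFwdAlgo = cudnnConvolutionFwdAlgo_t;
#else
using ConvFwdAlgo = int;  // stand-in so the sketch compiles without either SDK
#endif

// Callers see the same getter either way; only the cached type differs.
struct DemoSearchCache {
  ConvFwdAlgo* GetForward() { return &forward_cache_; }
 private:
  ConvFwdAlgo forward_cache_{};
};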
paddle/fluid/framework/copy_same_tensor_test.cc
@@ -31,7 +31,7 @@ namespace framework {
 static std::vector<platform::Place> CreatePlaceList() {
   std::vector<platform::Place> places;
   places.emplace_back(platform::CPUPlace());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   places.emplace_back(platform::CUDAPlace(0));
 #endif
   return places;
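Only the guard changes here because platform::CUDAPlace doubles as the device place on ROCm builds. A self-contained sketch of the widened place list; DemoPlace is a hypothetical stand-in for platform::Place:

#include <vector>

struct DemoPlace { bool is_gpu = false; };

std::vector<DemoPlace> CreateDemoPlaces() {
  std::vector<DemoPlace> places;
  places.push_back({false});  // CPU place, always present
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  places.push_back({true});   // device 0, CUDA or ROCm
#endif
  return places;
}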
paddle/fluid/framework/data_feed.cc
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
   } else {
 #ifdef PADDLE_WITH_CUDA
     cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+#elif defined(PADDLE_WITH_HIP)
+    hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
 #else
     PADDLE_THROW(platform::errors::Unimplemented(
-        "Not supported GPU, please compile with option WITH_GPU=ON."));
+        "Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
+        "WITH_ROCM=ON."));
 #endif
   }
 }
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
 #endif
 }

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 void PrivateInstantDataFeed<T>::PutToFeedVec() {
   for (size_t i = 0; i < use_slots_.size(); ++i) {
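The new branch leans on the runtime-API symmetry between CUDA and HIP: hipMemcpy mirrors cudaMemcpy argument for argument. A standalone sketch with a hypothetical DemoCopyToDevice helper; the host fallback is added so the sketch compiles anywhere, whereas the real code throws:

#include <cstddef>
#include <cstring>

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#elif defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#endif

void DemoCopyToDevice(void* dst, const void* src, size_t size) {
#ifdef PADDLE_WITH_CUDA
  cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);  // NVIDIA runtime
#elif defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, size, hipMemcpyHostToDevice);    // AMD runtime, same shape
#else
  std::memcpy(dst, src, size);  // host-only fallback for this sketch
#endif
}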
paddle/fluid/framework/data_feed.h
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
   int pv_batch_size_;
 };

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 class PrivateInstantDataFeed : public DataFeed {
  public:
paddle/fluid/framework/data_feed_factory.cc
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
 REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
 REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
 REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
 #endif
 }  // namespace framework
paddle/fluid/framework/data_type_transform.cc
@@ -47,7 +47,7 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
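This guard uses compiler macros rather than build options: __NVCC__ and __HIPCC__ are set only when nvcc or hipcc compiles the current translation unit, which is what CastDataType's GPU branch needs in order to even parse. An illustrative standalone sketch:

#include <cstdio>

void ReportCompiler() {
#if defined(__NVCC__)
  std::puts("compiled by nvcc: CUDA branch available");
#elif defined(__HIPCC__)
  std::puts("compiled by hipcc: HIP branch available");
#else
  std::puts("plain host compiler: CPU path only");
#endif
}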
paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else if (platform::is_gpu_place(in_tensor.place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     VarHandle *out_handle = nullptr;
     int root_id =
         BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
paddle/fluid/framework/details/broadcast_op_handle.h
@@ -34,7 +34,7 @@ class Node;
 }  // namespace ir
 }  // namespace framework

 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct NCCLContextMap;
 #endif
 #if defined(PADDLE_WITH_XPU_BKCL)
@@ -43,7 +43,7 @@ struct BKCLContextMap;
 }  // namespace platform
 }  // namespace paddle

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -55,7 +55,7 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #elif defined(PADDLE_WITH_XPU_BKCL)
   const platform::BKCLContextMap *bkcl_ctxs_;
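RCCL keeps NCCL's types and entry points, so once the guard admits PADDLE_WITH_RCCL the same collective code compiles on both stacks; presumably Paddle's nccl_helper.h resolves the symbols to RCCL under ROCm. A hedged sketch: DemoBcast is hypothetical, ncclBcast is the real NCCL/RCCL call, and gpuStream_t is assumed to be Paddle's stream alias (see device_worker.h below):

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"

// Broadcast `count` floats from rank 0; identical source for NCCL and RCCL.
void DemoBcast(float* buf, size_t count, ncclComm_t comm, gpuStream_t stream) {
  ncclBcast(buf, count, ncclFloat, /*root=*/0, comm, stream);
}
#endif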
paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
   test_op.TestBroadcastSelectedRows(input_scope_idx);
 }

-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   DeviceType use_device_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
           platform::errors::PreconditionNotMet("Not compiled with BKCL."));
 #endif
     } else if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
 #if defined(PADDLE_WITH_XPU_BKCL)
     bkcl_ctxs_.reset(nullptr);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     nccl_ctxs_.reset(nullptr);
 #endif
   }
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
paddle/fluid/framework/device_worker.h
@@ -52,7 +52,7 @@ class DeviceContext;
 }  // namespace platform
 }  // namespace paddle

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -73,11 +73,12 @@ class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
-#ifdef PADDLE_WITH_CUDA
-  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); }
 #endif

-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) { places_.push_back(place); }
@@ -137,8 +138,8 @@ class PullDenseWorker {
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;

-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
 #endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
@@ -167,9 +168,9 @@ class DeviceWorker {
   virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void ProduceTasks() {}
   virtual void GetXpuOpIndex() {}
-#ifdef PADDLE_WITH_CUDA
-  virtual void SetStream(const cudaStream_t stream) {}
-  virtual void SetEvent(const cudaEvent_t event) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  virtual void SetStream(const gpuStream_t stream) {}
+  virtual void SetEvent(const gpuEvent_t event) {}
 #endif
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
 };
 #endif

-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterBoxWorker : public HogwildWorker {
  public:
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
 };
 #endif

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUWorker : public HogwildWorker {
  public:
   PSGPUWorker() {}
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
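The signature changes above depend on gpuStream_t/gpuEvent_t aliases that Paddle defines for whichever runtime is active. A sketch of how such aliases are presumably declared (the exact Paddle header is not shown in this diff):

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif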
paddle/fluid/framework/device_worker_factory.cc
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
 REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
paddle/fluid/framework/dim_test.cu
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
   // construct a Dim on the GPU
   thrust::device_vector<paddle::framework::Dim<2>> t(2);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(t.data()));
+#else
   test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+#endif
   a = t[0];
   EXPECT_EQ(a[0], 5);
   EXPECT_EQ(a[1], 6);
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
   // dynamic access on GPU
   thrust::device_vector<int64_t> r(1);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(r.data()));
+#else
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+#endif
   int64_t res = r[0];
   EXPECT_EQ(res, 6);
 }
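On the HIP side the tests launch kernels through the hipLaunchKernelGGL macro rather than CUDA's triple-chevron syntax; its first five arguments are the kernel, grid dim, block dim, dynamic shared-memory bytes, and stream, followed by the kernel's own arguments. A minimal sketch, meaningful only under nvcc or hipcc:

#if defined(__CUDACC__) || defined(__HIPCC__)
__global__ void demo_kernel(int* out) { *out = 42; }

void LaunchDemo(int* device_out) {
#ifdef PADDLE_WITH_HIP
  hipLaunchKernelGGL(demo_kernel, dim3(1), dim3(1), 0, 0, device_out);
#else
  demo_kernel<<<1, 1>>>(device_out);
#endif
}
#endif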
paddle/fluid/framework/dlpack_tensor.cc
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }

   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLGPU;
     ctx.device_id = place.device;
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }

   inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLCPUPinned;
     ctx.device_id = 0;
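Both GPU stacks advertise the same DLPack context here: device type kDLGPU plus the device id. A sketch with demo mirrors of DLPack's DLDeviceType/DLContext; the enum values match the dlpack.h vintage this code appears to target, which is an assumption:

enum DemoDLDeviceType { kDemoDLCPU = 1, kDemoDLGPU = 2, kDemoDLCPUPinned = 3 };
struct DemoDLContext { DemoDLDeviceType device_type; int device_id; };

// Under either CUDA or ROCm the visitor yields the same context shape.
DemoDLContext MakeGpuContext(int device_id) {
  return {kDemoDLGPU, device_id};
}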
paddle/fluid/framework/dlpack_tensor_test.cc
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
 template <typename T>
 void TestMainLoop() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::vector<platform::Place> places{platform::CPUPlace(),
                                       platform::CUDAPlace(0),
                                       platform::CUDAPinnedPlace()};
paddle/fluid/framework/executor.cc
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
     if (platform::is_gpu_place(place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
paddle/fluid/framework/generator.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {

 const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   static int64_t num_cuda_devices = -1;
   static std::once_flag num_devices_init_flag;
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
 std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
     uint64_t increament_offset) {
   uint64_t cur_offset = this->state_.thread_offset;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::lock_guard<std::mutex> lock(this->mu_);
   this->state_.thread_offset += increament_offset;