Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
580447d0
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
580447d0
编写于
2月 25, 2021
作者:
Q
Qi Li
提交者:
GitHub
2月 25, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid framework for rocm (part4), test=develop (#31013)
上级
7d91974c
变更
19
显示空白变更内容
内联
并排
Showing
19 changed file
with
137 addition
and
56 deletion
+137
-56
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+31
-5
paddle/fluid/framework/array.h
paddle/fluid/framework/array.h
+14
-4
paddle/fluid/framework/conv_search_cache.h
paddle/fluid/framework/conv_search_cache.h
+27
-1
paddle/fluid/framework/copy_same_tensor_test.cc
paddle/fluid/framework/copy_same_tensor_test.cc
+1
-1
paddle/fluid/framework/data_feed.cc
paddle/fluid/framework/data_feed.cc
+5
-2
paddle/fluid/framework/data_feed.h
paddle/fluid/framework/data_feed.h
+1
-1
paddle/fluid/framework/data_feed_factory.cc
paddle/fluid/framework/data_feed_factory.cc
+1
-1
paddle/fluid/framework/data_type_transform.cc
paddle/fluid/framework/data_type_transform.cc
+1
-1
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+1
-1
paddle/fluid/framework/details/broadcast_op_handle.h
paddle/fluid/framework/details/broadcast_op_handle.h
+4
-4
paddle/fluid/framework/details/broadcast_op_handle_test.cc
paddle/fluid/framework/details/broadcast_op_handle_test.cc
+2
-1
paddle/fluid/framework/details/broadcast_op_handle_test.h
paddle/fluid/framework/details/broadcast_op_handle_test.h
+5
-5
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+23
-20
paddle/fluid/framework/device_worker_factory.cc
paddle/fluid/framework/device_worker_factory.cc
+5
-3
paddle/fluid/framework/dim_test.cu
paddle/fluid/framework/dim_test.cu
+10
-0
paddle/fluid/framework/dlpack_tensor.cc
paddle/fluid/framework/dlpack_tensor.cc
+2
-2
paddle/fluid/framework/dlpack_tensor_test.cc
paddle/fluid/framework/dlpack_tensor_test.cc
+1
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+1
-1
paddle/fluid/framework/generator.cc
paddle/fluid/framework/generator.cc
+2
-2
未找到文件。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
580447d0
...
...
@@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
cc_library
(
ddim SRCS ddim.cc DEPS eigen3 boost enforce
)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
nv_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
if
(
WITH_GPU
)
nv_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
elseif
(
WITH_ROCM
)
hip_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
endif
()
cc_test
(
unroll_array_ops_test SRCS unroll_array_ops_test.cc
)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
...
...
@@ -46,6 +50,8 @@ if(WITH_GPU)
else
()
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler
)
endif
(
WIN32
)
elseif
(
WITH_ROCM
)
hip_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler
)
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler
)
endif
()
...
...
@@ -53,6 +59,8 @@ endif()
cc_test
(
tensor_test SRCS tensor_test.cc DEPS tensor
)
if
(
WITH_GPU
)
nv_test
(
tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor
)
elseif
(
WITH_ROCM
)
hip_test
(
tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor
)
else
()
cc_test
(
tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor
)
endif
()
...
...
@@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
if
(
WITH_GPU
)
nv_test
(
mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor
)
elseif
(
WITH_ROCM
)
hip_test
(
mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor
)
else
()
cc_test
(
mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor
)
endif
()
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version
)
cc_test
(
lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory
)
nv_test
(
lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor
)
if
(
WITH_GPU
)
nv_test
(
lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor
)
elseif
(
WITH_ROCM
)
hip_test
(
lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor
)
endif
()
cc_library
(
garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog
)
...
...
@@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test
(
variable_test SRCS variable_test.cc DEPS tensor var_type_traits
)
cc_library
(
data_device_transform SRCS data_device_transform.cc DEPS tensor
)
nv_test
(
data_device_transform_test SRCS data_device_transform_test.cu
if
(
WITH_GPU
)
nv_test
(
data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope
)
elseif
(
WITH_ROCM
)
hip_test
(
data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope
)
endif
()
if
(
WITH_GPU
)
if
(
WIN32
)
...
...
@@ -108,6 +128,9 @@ if(WITH_GPU)
nv_library
(
data_type_transform SRCS data_type_transform.cu DEPS tensor
)
endif
(
WIN32
)
nv_test
(
data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform
)
elseif
(
WITH_ROCM
)
hip_library
(
data_type_transform SRCS data_type_transform.cu DEPS tensor
)
hip_test
(
data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform
)
else
()
cc_library
(
data_type_transform SRCS data_type_transform.cc DEPS tensor
)
cc_test
(
data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform
)
...
...
@@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
cc_library
(
op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce
)
cc_test
(
op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack
)
nv_test
(
op_registry_test SRCS op_registry_test.cc DEPS op_registry
)
if
(
WITH_GPU
)
nv_test
(
op_registry_test SRCS op_registry_test.cc DEPS op_registry
)
elseif
(
WITH_ROCM
)
hip_test
(
op_registry_test SRCS op_registry_test.cc DEPS op_registry
)
endif
()
if
(
WITH_PYTHON
)
py_proto_compile
(
framework_py_proto SRCS framework.proto data_feed.proto
)
...
...
paddle/fluid/framework/array.h
浏览文件 @
580447d0
...
...
@@ -54,7 +54,7 @@ class Array {
}
HOSTDEVICE
inline
T
&
at
(
size_t
i
)
{
#if
ndef __CUDA_ARCH__
#if
!defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT
(
i
,
N
,
platform
::
errors
::
OutOfRange
(
"Array index out of bounds."
));
#endif
...
...
@@ -62,7 +62,7 @@ class Array {
}
HOSTDEVICE
inline
const
T
&
at
(
size_t
i
)
const
{
#if
ndef __CUDA_ARCH__
#if
!defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT
(
i
,
N
,
platform
::
errors
::
OutOfRange
(
"Array index out of bounds."
));
#endif
...
...
@@ -103,7 +103,12 @@ class Array<T, 0> {
HOSTDEVICE
inline
T
*
GetMutable
()
{
return
nullptr
;
}
HOSTDEVICE
inline
T
&
operator
[](
size_t
)
{
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP will have compile error, if use "obj()"
// function declared in block scope cannot have 'static' storage class
static
T
obj
{};
return
obj
;
#elif defined(__CUDA_ARCH__)
static
T
obj
();
return
obj
;
#else
...
...
@@ -112,7 +117,12 @@ class Array<T, 0> {
}
HOSTDEVICE
inline
const
T
&
operator
[](
size_t
)
const
{
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP will have compile error, if use "obj()"
// function declared in block scope cannot have 'static' storage class
static
const
T
obj
{};
return
obj
;
#elif defined(__CUDA_ARCH__)
static
const
T
obj
();
return
obj
;
#else
...
...
paddle/fluid/framework/conv_search_cache.h
浏览文件 @
580447d0
...
...
@@ -16,7 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#else
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -32,7 +37,20 @@ class ConvSearchCache {
static
ConvSearchCache
instance
;
return
instance
;
}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache
<
miopenConvFwdAlgorithm_t
>*
GetForward
()
{
return
&
forward_cache_
;
}
AlgorithmsCache
<
miopenConvBwdDataAlgorithm_t
>*
GetBackwardData
()
{
return
&
backward_data_cache_
;
}
AlgorithmsCache
<
miopenConvBwdWeightsAlgorithm_t
>*
GetBackwardFilter
()
{
return
&
backward_filter_cache_
;
}
AlgorithmsCache
<
miopenConvFwdAlgorithm_t
>*
GetConvFusion
()
{
return
&
fusion_forward_cache_
;
}
#else
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
GetForward
()
{
return
&
forward_cache_
;
}
...
...
@@ -45,6 +63,7 @@ class ConvSearchCache {
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
GetConvFusion
()
{
return
&
fusion_forward_cache_
;
}
#endif
private:
ConvSearchCache
()
{}
...
...
@@ -52,10 +71,17 @@ class ConvSearchCache {
ConvSearchCache
(
const
ConvSearchCache
&
)
{}
ConvSearchCache
&
operator
=
(
const
ConvSearchCache
&
)
{}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache
<
miopenConvFwdAlgorithm_t
>
forward_cache_
;
AlgorithmsCache
<
miopenConvBwdDataAlgorithm_t
>
backward_data_cache_
;
AlgorithmsCache
<
miopenConvBwdWeightsAlgorithm_t
>
backward_filter_cache_
;
AlgorithmsCache
<
miopenConvFwdAlgorithm_t
>
fusion_forward_cache_
;
#else
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
forward_cache_
;
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
backward_data_cache_
;
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
backward_filter_cache_
;
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
fusion_forward_cache_
;
#endif
};
}
// namespace framework
...
...
paddle/fluid/framework/copy_same_tensor_test.cc
浏览文件 @
580447d0
...
...
@@ -31,7 +31,7 @@ namespace framework {
static
std
::
vector
<
platform
::
Place
>
CreatePlaceList
()
{
std
::
vector
<
platform
::
Place
>
places
;
places
.
emplace_back
(
platform
::
CPUPlace
());
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
places
.
emplace_back
(
platform
::
CUDAPlace
(
0
));
#endif
return
places
;
...
...
paddle/fluid/framework/data_feed.cc
浏览文件 @
580447d0
...
...
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
}
else
{
#ifdef PADDLE_WITH_CUDA
cudaMemcpy
(
dst
,
src
,
size
,
cudaMemcpyHostToDevice
);
#elif defined(PADDLE_WITH_HIP)
hipMemcpy
(
dst
,
src
,
size
,
hipMemcpyHostToDevice
);
#else
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Not supported GPU, please compile with option WITH_GPU=ON."
));
"Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
"WITH_ROCM=ON."
));
#endif
}
}
...
...
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
#endif
}
#if
defined(PADDLE_WITH_CUDA
) && !defined(_WIN32)
#if
(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
) && !defined(_WIN32)
template
<
typename
T
>
void
PrivateInstantDataFeed
<
T
>::
PutToFeedVec
()
{
for
(
size_t
i
=
0
;
i
<
use_slots_
.
size
();
++
i
)
{
...
...
paddle/fluid/framework/data_feed.h
浏览文件 @
580447d0
...
...
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
int
pv_batch_size_
;
};
#if
defined(PADDLE_WITH_CUDA
) && !defined(_WIN32)
#if
(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
) && !defined(_WIN32)
template
<
typename
T
>
class
PrivateInstantDataFeed
:
public
DataFeed
{
public:
...
...
paddle/fluid/framework/data_feed_factory.cc
浏览文件 @
580447d0
...
...
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
REGISTER_DATAFEED_CLASS
(
MultiSlotDataFeed
);
REGISTER_DATAFEED_CLASS
(
MultiSlotInMemoryDataFeed
);
REGISTER_DATAFEED_CLASS
(
PaddleBoxDataFeed
);
#if
defined(PADDLE_WITH_CUDA
) && !defined(_WIN32)
#if
(defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
) && !defined(_WIN32)
REGISTER_DATAFEED_CLASS
(
MultiSlotFileInstantDataFeed
);
#endif
}
// namespace framework
...
...
paddle/fluid/framework/data_type_transform.cc
浏览文件 @
580447d0
...
...
@@ -47,7 +47,7 @@ struct CastDataType {
auto
*
context
=
static_cast
<
const
platform
::
CPUDeviceContext
*>
(
ctx_
);
trans
(
*
context
,
in_begin
,
in_end
,
out_begin
,
CastDataTypeFunctor
<
InType
,
OutType
>
());
#if
def __NVCC__
#if
defined(__NVCC__) || defined(__HIPCC__)
}
else
if
(
platform
::
is_gpu_place
(
in_
.
place
()))
{
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
context
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
ctx_
);
...
...
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
580447d0
...
...
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
}
}
else
if
(
platform
::
is_gpu_place
(
in_tensor
.
place
()))
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
VarHandle
*
out_handle
=
nullptr
;
int
root_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
in_tensor
.
place
()).
device
;
...
...
paddle/fluid/framework/details/broadcast_op_handle.h
浏览文件 @
580447d0
...
...
@@ -34,7 +34,7 @@ class Node;
}
// namespace ir
}
// namespace framework
namespace
platform
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
struct
NCCLContextMap
;
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
...
...
@@ -43,7 +43,7 @@ struct BKCLContextMap;
}
// namespace platform
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
...
...
@@ -55,7 +55,7 @@ namespace details {
struct
BroadcastOpHandle
:
public
OpHandleBase
{
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
BroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
nccl_ctxs
)
...
...
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#elif defined(PADDLE_WITH_XPU_BKCL)
const
platform
::
BKCLContextMap
*
bkcl_ctxs_
;
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.cc
浏览文件 @
580447d0
...
...
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
test_op
.
TestBroadcastSelectedRows
(
input_scope_idx
);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST
(
BroadcastTester
,
TestGPUBroadcastTestLodTensor
)
{
TestBroadcastOpHandle
test_op
;
size_t
input_scope_idx
=
0
;
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.h
浏览文件 @
580447d0
...
...
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
vector
<
p
::
Place
>
place_list_
;
DeviceType
use_device_
;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
...
...
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
...
...
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with BKCL."
));
#endif
}
else
if
(
use_device_
==
p
::
kCUDA
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
...
...
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
#if defined(PADDLE_WITH_XPU_BKCL)
bkcl_ctxs_
.
reset
(
nullptr
);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
...
...
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
nodes_
.
emplace_back
(
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
));
if
(
use_device_
==
p
::
kCUDA
)
{
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
op_handle_
=
new
BroadcastOpHandle
(
nodes_
.
back
().
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
());
#else
...
...
paddle/fluid/framework/device_worker.h
浏览文件 @
580447d0
...
...
@@ -52,7 +52,7 @@ class DeviceContext;
}
// namespace platform
}
// namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
...
...
@@ -73,11 +73,12 @@ class PullDenseWorker {
public:
virtual
~
PullDenseWorker
()
{}
virtual
void
Initialize
(
const
TrainerDesc
&
param
);
#if
def PADDLE_WITH_CUDA
void
AddStream
(
const
cuda
Stream_t
stream
)
{
copy_streams_
.
push_back
(
stream
);
}
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void
AddStream
(
const
gpu
Stream_t
stream
)
{
copy_streams_
.
push_back
(
stream
);
}
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
void
AddPlace
(
const
paddle
::
platform
::
Place
place
)
{
places_
.
push_back
(
place
);
}
...
...
@@ -137,8 +138,8 @@ class PullDenseWorker {
float
total_batch_num_
=
0
;
std
::
unordered_map
<
const
Scope
*
,
int
>
scope_to_thread_id_
;
#if
def PADDLE_WITH_CUDA
std
::
vector
<
cuda
Stream_t
>
copy_streams_
;
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
vector
<
gpu
Stream_t
>
copy_streams_
;
#endif
std
::
vector
<
paddle
::
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
thread_scopes_
;
...
...
@@ -167,9 +168,9 @@ class DeviceWorker {
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{}
virtual
void
ProduceTasks
()
{}
virtual
void
GetXpuOpIndex
()
{}
#if
def PADDLE_WITH_CUDA
virtual
void
SetStream
(
const
cuda
Stream_t
stream
)
{}
virtual
void
SetEvent
(
const
cuda
Event_t
event
)
{}
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
virtual
void
SetStream
(
const
gpu
Stream_t
stream
)
{}
virtual
void
SetEvent
(
const
gpu
Event_t
event
)
{}
#endif
virtual
void
SetNeedDumpField
(
bool
need_dump_field
)
{
need_dump_field_
=
need_dump_field
;
...
...
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class
HeterBoxWorker
:
public
HogwildWorker
{
public:
...
...
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
new
(
&
program_
)
ProgramDesc
(
main_program
);
}
virtual
void
ProduceTasks
()
override
;
virtual
void
SetStream
(
const
cuda
Stream_t
stream
)
{
copy_stream_
=
stream
;
}
virtual
void
SetEvent
(
const
cuda
Event_t
event
)
{
event_
=
event
;
}
virtual
void
SetStream
(
const
gpu
Stream_t
stream
)
{
copy_stream_
=
stream
;
}
virtual
void
SetEvent
(
const
gpu
Event_t
event
)
{
event_
=
event
;
}
virtual
void
TrainFilesWithProfiler
()
{}
void
ResetStat
();
...
...
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
std
::
unordered_map
<
uint64_t
,
std
::
unordered_set
<
uint64_t
>>
feasign_set_
;
paddle
::
framework
::
Channel
<
std
::
shared_ptr
<
HeterTask
>>
pull_queue_
;
paddle
::
framework
::
Channel
<
std
::
shared_ptr
<
HeterTask
>>
push_queue_
;
cuda
Event_t
event_
;
cuda
Stream_t
copy_stream_
;
gpu
Event_t
event_
;
gpu
Stream_t
copy_stream_
;
int
batch_cnt_
{
0
};
std
::
atomic
<
int
>
done_cnt_
{
0
};
...
...
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class
PSGPUWorker
:
public
HogwildWorker
{
public:
PSGPUWorker
()
{}
...
...
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
new
(
&
program_
)
ProgramDesc
(
main_program
);
}
virtual
void
ProduceTasks
()
override
;
virtual
void
SetStream
(
const
cuda
Stream_t
stream
)
{
copy_stream_
=
stream
;
}
virtual
void
SetEvent
(
const
cuda
Event_t
event
)
{
event_
=
event
;
}
virtual
void
SetStream
(
const
gpu
Stream_t
stream
)
{
copy_stream_
=
stream
;
}
virtual
void
SetEvent
(
const
gpu
Event_t
event
)
{
event_
=
event
;
}
virtual
void
TrainFilesWithProfiler
()
{}
void
ResetStat
();
...
...
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
std
::
unordered_map
<
uint64_t
,
std
::
unordered_set
<
uint64_t
>>
feasign_set_
;
paddle
::
framework
::
Channel
<
std
::
shared_ptr
<
HeterTask
>>
pull_queue_
;
paddle
::
framework
::
Channel
<
std
::
shared_ptr
<
HeterTask
>>
push_queue_
;
cuda
Event_t
event_
;
cuda
Stream_t
copy_stream_
;
gpu
Event_t
event_
;
gpu
Stream_t
copy_stream_
;
int
batch_cnt_
{
0
};
std
::
atomic
<
int
>
done_cnt_
{
0
};
...
...
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
class
SectionWorker
:
public
DeviceWorker
{
public:
SectionWorker
()
{}
...
...
paddle/fluid/framework/device_worker_factory.cc
浏览文件 @
580447d0
...
...
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
REGISTER_DEVICE_WORKER_CLASS
(
HeterCpuWorker
);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS
(
HeterBoxWorker
);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS
(
PSGPUWorker
);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
|| defined(PADDLE_WITH_RCCL)
REGISTER_DEVICE_WORKER_CLASS
(
SectionWorker
);
#endif
}
// namespace framework
...
...
paddle/fluid/framework/dim_test.cu
浏览文件 @
580447d0
...
...
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
// construct a Dim on the GPU
thrust
::
device_vector
<
paddle
::
framework
::
Dim
<
2
>>
t
(
2
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
test
,
dim3
(
1
),
dim3
(
1
),
0
,
0
,
thrust
::
raw_pointer_cast
(
t
.
data
()));
#else
test
<<<
1
,
1
>>>
(
thrust
::
raw_pointer_cast
(
t
.
data
()));
#endif
a
=
t
[
0
];
EXPECT_EQ
(
a
[
0
],
5
);
EXPECT_EQ
(
a
[
1
],
6
);
...
...
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
// dynamic access on GPU
thrust
::
device_vector
<
int64_t
>
r
(
1
);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL
(
dyn_idx_gpu
,
dim3
(
1
),
dim3
(
1
),
0
,
0
,
thrust
::
raw_pointer_cast
(
r
.
data
()));
#else
dyn_idx_gpu
<<<
1
,
1
>>>
(
thrust
::
raw_pointer_cast
(
r
.
data
()));
#endif
int64_t
res
=
r
[
0
];
EXPECT_EQ
(
res
,
6
);
}
...
...
paddle/fluid/framework/dlpack_tensor.cc
浏览文件 @
580447d0
...
...
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline
::
DLContext
operator
()(
const
platform
::
CUDAPlace
&
place
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::
DLContext
ctx
;
ctx
.
device_type
=
kDLGPU
;
ctx
.
device_id
=
place
.
device
;
...
...
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline
::
DLContext
operator
()(
const
platform
::
CUDAPinnedPlace
&
place
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::
DLContext
ctx
;
ctx
.
device_type
=
kDLCPUPinned
;
ctx
.
device_id
=
0
;
...
...
paddle/fluid/framework/dlpack_tensor_test.cc
浏览文件 @
580447d0
...
...
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
template
<
typename
T
>
void
TestMainLoop
()
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
vector
<
platform
::
Place
>
places
{
platform
::
CPUPlace
(),
platform
::
CUDAPlace
(
0
),
platform
::
CUDAPinnedPlace
()};
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
580447d0
...
...
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
std
::
unique_ptr
<
GarbageCollector
>
gc
;
if
(
!
ctx
->
force_disable_gc_
&&
max_memory_size
>=
0
)
{
if
(
platform
::
is_gpu_place
(
place_
))
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
max_memory_size
));
...
...
paddle/fluid/framework/generator.cc
浏览文件 @
580447d0
...
...
@@ -25,7 +25,7 @@ namespace paddle {
namespace
framework
{
const
std
::
shared_ptr
<
Generator
>&
GetDefaultCUDAGenerator
(
int64_t
device_id
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static
int64_t
num_cuda_devices
=
-
1
;
static
std
::
once_flag
num_devices_init_flag
;
...
...
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
std
::
pair
<
uint64_t
,
uint64_t
>
Generator
::
IncrementOffset
(
uint64_t
increament_offset
)
{
uint64_t
cur_offset
=
this
->
state_
.
thread_offset
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
lock_guard
<
std
::
mutex
>
lock
(
this
->
mu_
);
this
->
state_
.
thread_offset
+=
increament_offset
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录