BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit a60d93fb (unverified)
Authored on Feb 23, 2021 by Qi Li; committed via GitHub on Feb 23, 2021
Parent commit: 565354f6

[ROCM] update fluid framework for rocm (part2), test=develop (#31010)
Showing 19 changed files with 144 additions and 53 deletions (+144 / -53).
paddle/fluid/framework/fleet/CMakeLists.txt              +11  -2
paddle/fluid/framework/fleet/box_wrapper.cc               +1  -1
paddle/fluid/framework/fleet/box_wrapper.cu              +48  -0
paddle/fluid/framework/fleet/box_wrapper.h                +6  -2
paddle/fluid/framework/fleet/box_wrapper_impl.h           +9  -3
paddle/fluid/framework/fleet/fleet_wrapper.cc             +8  -2
paddle/fluid/framework/fleet/fleet_wrapper.h              +3  -3
paddle/fluid/framework/fleet/heter_context.h              +2  -1
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt     +10  -6
paddle/fluid/framework/fleet/heter_ps/hashtable.h         +3  -3
paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h     +3  -3
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h    +2  -2
paddle/fluid/framework/fleet/heter_ps/heter_resource.h    +9  -9
paddle/fluid/framework/fleet/heter_wrapper.cc             +4  -4
paddle/fluid/framework/fleet/heter_wrapper.h              +2  -2
paddle/fluid/framework/fleet/nccl_wrapper.cc             +13  -5
paddle/fluid/framework/fleet/nccl_wrapper.h               +6  -3
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc            +2  -1
paddle/fluid/framework/fleet/ps_gpu_wrapper.h             +2  -1
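Taken together, the diffs below follow one porting pattern: CUDA-only types (cudaStream_t, cudaEvent_t) become the backend-neutral aliases gpuStream_t / gpuEvent_t, CUDA-only preprocessor guards gain a PADDLE_WITH_HIP / PADDLE_WITH_RCCL alternative, and runtime calls split into hipXxx / cudaXxx branches. As an illustrative sketch only (not part of this commit), an alias of that kind can be written roughly as follows; the exact Paddle platform header that actually defines gpuStream_t and gpuEvent_t is not shown on this page and is assumed here.

// Illustrative sketch, not from this commit: a backend-neutral stream/event
// alias of the kind the diffs below rely on.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif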
paddle/fluid/framework/fleet/CMakeLists.txt

@@ -4,6 +4,10 @@ if(WITH_PSLIB)
     nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
       DEPS heter_ps)
     add_subdirectory(heter_ps)
+  elseif(WITH_RCCL)
+    hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+      DEPS heter_ps)
+    add_subdirectory(heter_ps)
   else()
     cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
   endif(WITH_NCCL)
@@ -12,11 +16,16 @@ else()
   cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
 endif(WITH_PSLIB)

-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
   cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
 endif()
 if(WITH_BOX_PS)
-  nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+  if(WITH_GPU)
+    nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+  endif()
+  if(WITH_ROCM)
+    hip_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+  endif()
 else()
   cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
 endif(WITH_BOX_PS)
paddle/fluid/framework/fleet/box_wrapper.cc

@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {

 std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
-cudaStream_t BoxWrapper::stream_list_[8];
+gpuStream_t BoxWrapper::stream_list_[8];
 std::shared_ptr<boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
 AfsManager* BoxWrapper::afs_manager = nullptr;
 int BoxWrapper::embedx_dim_ = 8;
paddle/fluid/framework/fleet/box_wrapper.cu

@@ -142,8 +142,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                   ->stream();
   auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
              cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
     constexpr size_t EmbedxDim = i;                                          \
@@ -155,6 +160,19 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
     }                                                                        \
   } break
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(                                                      \
+        PushCopy<EmbedxDim, ExpandDim>, dim3((total_length + 512 - 1) / 512),\
+        dim3(512), 0, stream, gpu_values,                                    \
+        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(     \
+            total_values_gpu),                                               \
+        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
+        gpu_keys);                                                           \
+  } break
+#else
 #define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -166,6 +184,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
         gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
         gpu_keys);                                                           \
   } break
+#endif
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8);
@@ -187,9 +206,16 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
           platform::DeviceContextPool::Instance().Get(
               BOOST_GET_CONST(platform::CUDAPlace, place)))
           ->stream();
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
+                     dim3(512), 0, stream, origin_keys, total_keys, gpu_len,
+                     slot_num, total_len);
+  hipStreamSynchronize(stream);
+#else
   CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>(
       origin_keys, total_keys, gpu_len, slot_num, total_len);
   cudaStreamSynchronize(stream);
+#endif
 }

 void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
@@ -217,12 +243,21 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
   int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
   int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, grad_values.data(),
+            grad_values.size() * sizeof(float*), hipMemcpyHostToDevice);
+  hipMemcpy(gpu_len, slot_lengths_lod.data(),
+            slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+  hipMemcpy(d_slot_vector, slot_vector_.data(),
+            slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, grad_values.data(),
              grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_len, slot_lengths_lod.data(),
              slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
   cudaMemcpy(d_slot_vector, slot_vector_.data(),
              slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
@@ -235,6 +270,18 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
     }                                                                        \
   } break
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(PushCopy<EmbedxDim, ExpandDim>,                       \
+        dim3((total_length + 512 - 1) / 512), dim3(512), 0, stream,          \
+        reinterpret_cast<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>*>( \
+            total_grad_values_gpu),                                          \
+        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
+        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
+  } break
+#else
 #define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -245,6 +292,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
         gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
         slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
   } break
+#endif
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8);
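The CopyKeys hunk above shows the kernel-launch half of the port: the triple-chevron <<<grid, block, shared, stream>>> syntax is CUDA-only, so under HIP the same kernel is launched through hipLaunchKernelGGL with the grid and block passed as dim3 arguments. Below is a minimal sketch of that pattern, illustrative only, with a made-up ScaleKernel rather than any kernel from this file, and using the gpuStream_t alias sketched earlier.

// Illustrative sketch, not from this commit: one kernel, two launch syntaxes.
__global__ void ScaleKernel(float* data, int n, float factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void LaunchScale(float* data, int n, float factor, gpuStream_t stream) {
  dim3 grid((n + 512 - 1) / 512);
  dim3 block(512);
#ifdef PADDLE_WITH_HIP
  // HIP: macro-based launch; grid, block, shared memory and stream come first.
  hipLaunchKernelGGL(ScaleKernel, grid, block, 0, stream, data, n, factor);
#else
  // CUDA: triple-chevron launch configuration.
  ScaleKernel<<<grid, block, 0, stream>>>(data, n, factor);
#endif
}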
paddle/fluid/framework/fleet/box_wrapper.h

@@ -396,7 +396,7 @@ class BoxWrapper {
       const std::string& model_path) {
     if (nullptr != s_instance_) {
       VLOG(3) << "Begin InitializeGPU";
-      std::vector<cudaStream_t*> stream_list;
+      std::vector<gpuStream_t*> stream_list;
       for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) {
         VLOG(3) << "before get context i[" << i << "]";
         platform::CUDADeviceContext* context =
@@ -542,8 +542,12 @@ class BoxWrapper {
     auto* gpu_data = gpu_tensor.data<T>();
     auto len = gpu_tensor.numel();
     data->resize(len);
+#ifdef PADDLE_WITH_HIP
+    hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost);
+#else
     cudaMemcpy(data->data(), gpu_data, sizeof(T) * len,
                cudaMemcpyDeviceToHost);
+#endif
   }
   static inline std::pair<int, int> parse_cmatch_rank(uint64_t x) {
     // first 32 bit store cmatch and second 32 bit store rank
@@ -819,7 +823,7 @@ class BoxWrapper {
   }

  private:
-  static cudaStream_t stream_list_[8];
+  static gpuStream_t stream_list_[8];
   static std::shared_ptr<boxps::BoxPSBase> boxps_ptr_;
   boxps::PSAgentBase* p_agent_ = nullptr;
   // TODO(hutuxian): magic number, will add a config to specify
paddle/fluid/framework/fleet/box_wrapper_impl.h

@@ -43,7 +43,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
     PADDLE_THROW(platform::errors::Unimplemented(
         "Warning:: CPUPlace is not supported in PaddleBox now."));
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
     VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
     int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
     LoDTensor& total_keys_tensor = keys_tensor[device_id];
@@ -60,11 +60,17 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
         memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
     int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
+#ifdef PADDLE_WITH_HIP
+    hipMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
+              hipMemcpyHostToDevice);
+    hipMemcpy(gpu_len, slot_lengths_lod.data(),
+              slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+#else
     cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
                cudaMemcpyHostToDevice);
     cudaMemcpy(gpu_len, slot_lengths_lod.data(),
                slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
+#endif
     this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                    static_cast<int>(slot_lengths.size()),
                    static_cast<int>(total_length));
@@ -124,7 +130,7 @@ void BoxWrapper::PushSparseGradCase(
     PADDLE_THROW(platform::errors::Unimplemented(
         "Warning:: CPUPlace is not supported in PaddleBox now."));
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
     int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
     LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
     uint64_t* total_keys =
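The hipMemcpy/cudaMemcpy pairs above differ only in the function prefix and the copy-kind enumerator. A minimal sketch of the same guard written once as a helper follows; it is illustrative only, and CopyHostToDevice is a hypothetical name, not a function in this file.

#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Illustrative sketch, not from this commit: one synchronous host-to-device
// copy helper covering both backends.
void CopyHostToDevice(void* dst, const void* src, std::size_t bytes) {
#ifdef PADDLE_WITH_HIP
  hipMemcpy(dst, src, bytes, hipMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
#endif
}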
paddle/fluid/framework/fleet/fleet_wrapper.cc

@@ -698,13 +698,14 @@ void FleetWrapper::PushDenseVarsSync(
     Scope* scope, const uint64_t table_id,
     const std::vector<std::string>& var_names) {}

-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined PADDLE_WITH_PSLIB)
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
     const std::vector<std::string>& var_names,
     std::vector<::std::future<int32_t>>* push_sparse_status,
     float scale_datanorm, int batch_size,
     const paddle::platform::Place& place,
-    cudaStream_t stream, cudaEvent_t event) {
+    gpuStream_t stream, gpuEvent_t event) {
   std::vector<paddle::ps::Region> regions;
   for (auto& t : var_names) {
     Variable* var = scope.FindVar(t);
@@ -719,8 +720,13 @@ void FleetWrapper::PushDenseVarsAsync(
     memory::Copy(platform::CUDAPinnedPlace(), pin_g,
                  BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
                  sizeof(float) * count, stream);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+    hipEventSynchronize(event);
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
     cudaEventSynchronize(event);
+#endif

     float* g = pin_g;
     if (scale_datanorm >= 0) {
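PushDenseVarsAsync records and then waits on a gpuEvent_t so the copy into pinned memory has finished before the dense push proceeds; under HIP the event calls are hipEventRecord/hipEventSynchronize instead of their cuda counterparts. A minimal sketch of just that record-and-wait step follows; it is illustrative only, RecordAndWait is a hypothetical helper rather than part of FleetWrapper, and it assumes the gpuStream_t/gpuEvent_t aliases sketched earlier.

// Illustrative sketch, not from this commit: record an event on a stream and
// block the host until the work queued before it has completed.
void RecordAndWait(gpuStream_t stream, gpuEvent_t event) {
#ifdef PADDLE_WITH_HIP
  hipEventRecord(event, stream);
  hipEventSynchronize(event);
#else
  cudaEventRecord(event, stream);
  cudaEventSynchronize(event);
#endif
}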
paddle/fluid/framework/fleet/fleet_wrapper.h

@@ -152,14 +152,14 @@ class FleetWrapper {
   // Push dense variables to server in async mode
   // Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
   // Param<out>: push_sparse_status
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
       std::vector<::std::future<int32_t>>* push_sparse_status,
       float scale_datanorm, int batch_size,
-      const paddle::platform::Place& place, cudaStream_t stream,
-      cudaEvent_t event);
+      const paddle::platform::Place& place, gpuStream_t stream,
+      gpuEvent_t event);
 #endif
 #ifdef PADDLE_WITH_XPU
   void PushDenseVarsAsync(
paddle/fluid/framework/fleet/heter_context.h

@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)

 #include <algorithm>
 #include <map>
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt

-nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc
-        heter_resource.h hashtable.h DEPS cub device_context)
-nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS
-        heter_comm)
-nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+IF(WITH_GPU)
+    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
+    nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    hip_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
+    hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
paddle/fluid/framework/fleet/heter_ps/hashtable.h

@@ -45,15 +45,15 @@ class HashTable {
   HashTable(const HashTable&) = delete;
   HashTable& operator=(const HashTable&) = delete;
   void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
-              cudaStream_t stream);
+              gpuStream_t stream);
   void get(const KeyType* d_keys, ValType* d_vals, size_t len,
-           cudaStream_t stream);
+           gpuStream_t stream);
   void show();
   void dump_to_cpu(int devid, cudaStream_t stream);

   template <typename GradType, typename Sgd>
   void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
-              Sgd sgd, cudaStream_t stream);
+              Sgd sgd, gpuStream_t stream);

  private:
   TableContainer<KeyType, ValType>* container_;
paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h

@@ -87,7 +87,7 @@ void HashTable<KeyType, ValType>::show() {
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
-                                      size_t len, cudaStream_t stream) {
+                                      size_t len, gpuStream_t stream) {
   if (len == 0) {
     return;
   }
@@ -99,7 +99,7 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
                                          const ValType* d_vals, size_t len,
-                                         cudaStream_t stream) {
+                                         gpuStream_t stream) {
   if (len == 0) {
     return;
   }
@@ -147,7 +147,7 @@ template <typename KeyType, typename ValType>
 template <typename GradType, typename Sgd>
 void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
                                          const GradType* d_grads, size_t len,
-                                         Sgd sgd, cudaStream_t stream) {
+                                         Sgd sgd, gpuStream_t stream) {
   if (len == 0) {
     return;
   }
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h

@@ -25,7 +25,7 @@ __global__ void fill_idx(T* idx, size_t len) {
 }

 template <typename T>
-void show_tensor(T* input, size_t len, cudaStream_t stream, std::string name) {
+void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) {
   T tmp[len];
   cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream);
   cudaStreamSynchronize(stream);
@@ -270,7 +270,7 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
   std::vector<std::shared_ptr<memory::Allocation>> d_key_bufs;
   std::vector<std::shared_ptr<memory::Allocation>> d_val_bufs;

-  cudaStream_t streams[stream_num];
+  gpuStream_t streams[stream_num];
   for (int i = 0; i < stream_num; ++i) {
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i])));
     auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType));
paddle/fluid/framework/fleet/heter_ps/heter_resource.h

@@ -34,16 +34,16 @@ class GPUResource {
   int dev_id() const { return dev_id_; }
   int index() const { return index_; }
-  cudaStream_t local_stream(int num) { return local_streams_[num]; }
-  cudaStream_t remote_stream() { return remote_stream_; }
-  cudaStream_t comm_stream(int num) { return comm_streams_[num]; }
+  gpuStream_t local_stream(int num) { return local_streams_[num]; }
+  gpuStream_t remote_stream() { return remote_stream_; }
+  gpuStream_t comm_stream(int num) { return comm_streams_[num]; }

   int dev_id_;
   int index_;
   std::vector<int> dev_ids_;
-  cudaStream_t remote_stream_;
-  std::vector<cudaStream_t> local_streams_;
-  std::vector<cudaStream_t> comm_streams_;
+  gpuStream_t remote_stream_;
+  std::vector<gpuStream_t> local_streams_;
+  std::vector<gpuStream_t> comm_streams_;
 };

 class HeterPsResource {
@@ -56,9 +56,9 @@ class HeterPsResource {
   int total_gpu();
   int get_index_by_devid(int devid);
   int dev_id(int num);
-  cudaStream_t local_stream(int gpu_num, int stream_num);
-  cudaStream_t remote_stream(int gpu_num);
-  cudaStream_t comm_stream(int gpu_num, int stream_num);
+  gpuStream_t local_stream(int gpu_num, int stream_num);
+  gpuStream_t remote_stream(int gpu_num);
+  gpuStream_t comm_stream(int gpu_num, int stream_num);

   std::vector<std::shared_ptr<GPUResource>> resources_;
   std::vector<int> dev_ids_;
paddle/fluid/framework/fleet/heter_wrapper.cc

@@ -114,7 +114,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
@@ -129,11 +129,11 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   }
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place,
-                                       cudaStream_t stream) {
+                                       gpuStream_t stream) {
   // const VariableMessage& req_var = request->vars();
   auto* var = scope->FindVar(req_var.varname());
   auto* tensor = var->GetMutable<LoDTensor>();
@@ -157,7 +157,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   void* tensor_data =
       tensor->mutable_data(place, ToVarType(req_var.data_type()));

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
                platform::CPUPlace(), req_var.data().data(),
                tensor->numel() * SizeOfType(tensor->type()), stream);
paddle/fluid/framework/fleet/heter_wrapper.h

@@ -86,9 +86,9 @@ class HeterWrapper {
   framework::proto::VarType::Type ToVarType(VariableMessage::Type type);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
-                           platform::Place place, cudaStream_t stream);
+                           platform::Place place, gpuStream_t stream);
 #endif
   void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
                            platform::Place place);
paddle/fluid/framework/fleet/nccl_wrapper.cc

@@ -21,7 +21,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
 bool NCCLWrapper::is_initialized_ = false;

 void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
@@ -30,14 +30,14 @@ void NCCLWrapper::InitNCCL() {
 }

 void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   nccl_info_.nccl_id_ = nccl_info.nccl_id_;
 #endif
   return;
 }

 NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   PADDLE_ENFORCE_CUDA_SUCCESS(
       platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
@@ -46,19 +46,23 @@ NCCLInfo NCCLWrapper::GetNCCLId() {

 void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
                               const int ranks) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
   platform::SetDeviceId(local_rank);
+#ifdef PADDLE_WITH_RCCL
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_)));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
+#endif
 #endif
   return;
 }

 void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
                           const std::vector<std::string>& var_names) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   for (auto& name : var_names) {
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
@@ -66,7 +70,11 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
         root_rank, nccl_info_.comm_, nccl_info_.stream_));
+#ifdef PADDLE_WITH_RCCL
+    hipStreamSynchronize(nccl_info_.stream_);
+#else
     cudaStreamSynchronize(nccl_info_.stream_);
+#endif
   }
 #endif
   return;
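RCCL mirrors the NCCL API, which is why the ncclCommInitRank, ncclGetUniqueId, and ncclBcast calls above are untouched: only the preprocessor guards widen to accept PADDLE_WITH_RCCL, and only the stream synchronization after the broadcast differs by backend. A minimal sketch of that final sync step follows; it is illustrative only, SyncBroadcastStream is a hypothetical name, and it assumes the gpuStream_t alias sketched earlier.

// Illustrative sketch, not from this commit: after a collective is queued on a
// stream, wait for it with the matching runtime call.
void SyncBroadcastStream(gpuStream_t stream) {
#ifdef PADDLE_WITH_RCCL
  hipStreamSynchronize(stream);
#else
  cudaStreamSynchronize(stream);
#endif
}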
paddle/fluid/framework/fleet/nccl_wrapper.h

@@ -25,9 +25,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_NCCL)
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN

 namespace paddle {
@@ -48,10 +51,10 @@ class NCCLInfo {
   int local_rank_;
   int global_ranks_;
   int my_global_rank_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   ncclUniqueId nccl_id_;
   ncclComm_t comm_;
-  cudaStream_t stream_;
+  gpuStream_t stream_;
 #endif
 };
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc

@@ -26,7 +26,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)

 #include <algorithm>
 #include <deque>
paddle/fluid/framework/fleet/ps_gpu_wrapper.h

@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)

 #include <atomic>
 #include <ctime>