BaiXuePrincess / Paddle · Commit 27a5f52b
Forked from PaddlePaddle / Paddle (in sync with the upstream project)
Commit 27a5f52b (unverified)
Authored on Nov 29, 2021 by Thunderbrook; committed via GitHub on Nov 29, 2021.
[HeterPs] fix allocation (#37476)

* auc temp
* cuballocator
* code format
* code format
Parent: 5b962bd9
Showing 2 changed files with 56 additions and 29 deletions (+56 −29)
paddle/fluid/framework/fleet/heter_ps/heter_comm.h (+6 −4)
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h (+50 −25)
paddle/fluid/framework/fleet/heter_ps/heter_comm.h (view file @ 27a5f52b)
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thread>
 #include <vector>
 #include "cub/cub.cuh"
+#include "cub/util_allocator.cuh"
 #include "hashtable.h"
 #include "heter_resource.h"
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
@@ -163,9 +164,9 @@ class HeterComm {
   };
 
   void init_path();
-  void create_storage(
-      int start_index, int end_index, int keylen, int vallen,
-      std::vector<std::shared_ptr<memory::Allocation>>& local_strorage);
+  void create_storage(int start_index, int end_index, int keylen, int vallen);
+  void destroy_storage(int start_index, int end_index);
   void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right,
                     KeyType* src_key, GradType* src_val);
   void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right,
@@ -178,7 +179,7 @@ class HeterComm {
   std::vector<Table*> tables_;
   std::shared_ptr<HeterPsResource> resource_;
   CustomGradMerger merger_;
-  int topo_aware_{1};
+  int topo_aware_{0};
   std::vector<std::vector<Path>> path_;
   std::vector<LocalStorage> storage_;
   int feanum_{1800 * 2048};
@@ -186,6 +187,7 @@ class HeterComm {
   std::vector<ncclComm_t> nccl_inner_comms_;
   std::vector<ncclComm_t> nccl_inter_comms_;
   int node_size_;
+  std::vector<std::shared_ptr<cub::CachingDeviceAllocator>> allocators_;
 };
 
 }  // end namespace framework
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h (view file @ 27a5f52b)
@@ -100,6 +100,8 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
   storage_.resize(resource_->total_gpu());
   for (int i = 0; i < resource_->total_gpu(); ++i) {
     platform::CUDADeviceGuard guard(resource_->dev_id(i));
+    allocators_.push_back(std::make_shared<cub::CachingDeviceAllocator>(
+        8, 1, (unsigned int)-1, (size_t)-1, false, false));
     auto table = new Table(capacity / load_factor_);
     tables_.push_back(table);
     if (multi_node_) {
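For context on the allocator constructed in this hunk: below is a minimal standalone sketch (an assumed setup, not part of this commit) that uses cub::CachingDeviceAllocator with the same constructor arguments — bin growth 8, smallest bin 8^1 bytes, unbounded max bin and cache size, cleanup-skip and debug both off. Freed blocks go back into the allocator's per-device cache and later DeviceAllocate calls of a similar size are typically served from that cache instead of round-tripping through cudaMalloc/cudaFree.

// Hypothetical standalone example, not Paddle code.
#include <cstdio>
#include <cuda_runtime.h>
#include "cub/util_allocator.cuh"

int main() {
  cub::CachingDeviceAllocator allocator(8, 1, (unsigned int)-1, (size_t)-1,
                                        false, false);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  void* d_buf = nullptr;
  allocator.DeviceAllocate(0, &d_buf, 1 << 20, stream);  // 1 MB on device 0
  cudaMemsetAsync(d_buf, 0, 1 << 20, stream);
  cudaStreamSynchronize(stream);
  allocator.DeviceFree(0, d_buf);  // block is cached, not handed to cudaFree

  // A second allocation of the same size is typically served from the cache.
  allocator.DeviceAllocate(0, &d_buf, 1 << 20, stream);
  allocator.DeviceFree(0, d_buf);

  cudaStreamDestroy(stream);
  std::printf("cached allocation reused\n");
  return 0;
}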
@@ -115,14 +117,14 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
   path_.resize(total_gpu);
 
   if (!topo_aware_) {
-    VLOG(3) << "init path without topo aware";
+    VLOG(0) << "init path without topo aware";
     for (int i = 0; i < total_gpu; ++i) {
       path_[i].resize(total_gpu);
       for (int j = 0; j < total_gpu; ++j) {
         auto& nodes = path_[i][j].nodes_;
         nodes.resize(1);
         nodes[0].in_stream = resource_->comm_stream(i, j);
-        nodes[0].out_stream = resource_->comm_stream(j, i);
+        nodes[0].out_stream = resource_->comm_stream(i, j);
         nodes[0].key_storage = NULL;
         nodes[0].val_storage = NULL;
         nodes[0].sync = 0;
@@ -130,7 +132,7 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
       }
     }
   } else {
-    VLOG(3) << "init path with topo aware";
+    VLOG(0) << "init path with topo aware";
     for (int i = 0; i < total_gpu; ++i) {
       path_[i].resize(total_gpu);
       for (int j = 0; j < total_gpu; ++j) {
@@ -163,26 +165,41 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
 }
 
 template <typename KeyType, typename ValType, typename GradType>
-void HeterComm<KeyType, ValType, GradType>::create_storage(
-    int start_index, int end_index, int keylen, int vallen,
-    std::vector<std::shared_ptr<memory::Allocation>>& local_storage) {
+void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index,
+                                                           int end_index,
+                                                           int keylen,
+                                                           int vallen) {
+  auto& allocator = allocators_[start_index];
   auto& nodes = path_[start_index][end_index].nodes_;
   for (size_t i = 0; i < nodes.size(); ++i) {
     platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num));
-    platform::CUDAPlace remote_place =
-        platform::CUDAPlace(resource_->dev_id(nodes[i].gpu_num));
-    auto key_mem = memory::AllocShared(remote_place, keylen);
-    local_storage.push_back(key_mem);
-    nodes[i].key_storage = reinterpret_cast<char*>(key_mem->ptr());
-
-    auto val_mem = memory::AllocShared(remote_place, vallen);
-    local_storage.push_back(val_mem);
-    nodes[i].val_storage = reinterpret_cast<char*>(val_mem->ptr());
-
+    allocator->DeviceAllocate(
+        resource_->dev_id(nodes[i].gpu_num), (void**)&(nodes[i].key_storage),
+        keylen, resource_->remote_stream(nodes[i].gpu_num, start_index));
+    allocator->DeviceAllocate(
+        resource_->dev_id(nodes[i].gpu_num), (void**)&(nodes[i].val_storage),
+        vallen, resource_->remote_stream(nodes[i].gpu_num, start_index));
     nodes[i].key_bytes_len = keylen;
     nodes[i].val_bytes_len = vallen;
   }
 }
 
 template <typename KeyType, typename ValType, typename GradType>
+void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index,
+                                                            int end_index) {
+  auto& allocator = allocators_[start_index];
+  auto& nodes = path_[start_index][end_index].nodes_;
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num));
+
+    allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num),
+                          nodes[i].key_storage);
+    allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num),
+                          nodes[i].val_storage);
+  }
+}
+
+template <typename KeyType, typename ValType, typename GradType>
 void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
                                                          int gpu_num,
                                                          int* h_left,
                                                          int* h_right,
                                                          KeyType* src_key,
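A hedged sketch of the pattern the new create_storage/destroy_storage follow, reduced to raw CUDA plus cub and stripped of the Paddle types: CUDADeviceGuard, HeterPsResource, and the node bookkeeping are replaced by plain device ids and a single stream, and the buffer sizes are made up. The point is that buffers live on the destination GPU, are requested through the caching allocator against the stream that will use them, and are released explicitly afterwards.

// Hypothetical sketch, not the Paddle implementation.
#include <cuda_runtime.h>
#include "cub/util_allocator.cuh"

int main() {
  int dev_count = 0;
  cudaGetDeviceCount(&dev_count);
  int dst_dev = dev_count > 1 ? 1 : 0;  // "remote" GPU when more than one exists

  cub::CachingDeviceAllocator allocator(8, 1, (unsigned int)-1, (size_t)-1,
                                        false, false);

  cudaSetDevice(dst_dev);  // stands in for platform::CUDADeviceGuard
  cudaStream_t remote_stream;
  cudaStreamCreate(&remote_stream);

  // create_storage analogue: key and value buffers on the destination device.
  void* key_storage = nullptr;
  void* val_storage = nullptr;
  allocator.DeviceAllocate(dst_dev, &key_storage, 4096, remote_stream);
  allocator.DeviceAllocate(dst_dev, &val_storage, 4096, remote_stream);

  // ... transfers and kernels on remote_stream would run here ...
  cudaStreamSynchronize(remote_stream);

  // destroy_storage analogue: explicit frees back into the allocator's cache.
  allocator.DeviceFree(dst_dev, key_storage);
  allocator.DeviceFree(dst_dev, val_storage);

  cudaStreamDestroy(remote_stream);
  return 0;
}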
@@ -482,8 +499,8 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
   int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
   int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
 
-  cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int));
-  cudaMemset(d_right_ptr, -1, total_gpu * sizeof(int));
+  cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
+  cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
   //
   auto d_idx = memory::AllocShared(place, len * sizeof(int));
   int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
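This hunk (and the matching change in push_sparse further down) replaces default-stream memsets with stream-ordered ones, so the sentinel fill is queued on the same stream as the kernels and copies that follow. A small hypothetical CUDA example of the same idiom; the kernel and names are made up, but the -1 byte pattern matches the sentinel written into d_left_ptr/d_right_ptr above.

// Hypothetical example, not Paddle code: stream-ordered memset plus dependent work.
#include <cuda_runtime.h>

__global__ void count_negatives(const int* data, int n, int* result) {
  // Runs after the memsets because it is enqueued on the same stream.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n && data[idx] < 0) atomicAdd(result, 1);
}

int main() {
  const int n = 8;
  int *d_flags = nullptr, *d_count = nullptr;
  cudaMalloc(&d_flags, n * sizeof(int));
  cudaMalloc(&d_count, sizeof(int));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Every byte set to 0xff, so each int holds -1 (the sentinel used above).
  cudaMemsetAsync(d_flags, -1, n * sizeof(int), stream);
  cudaMemsetAsync(d_count, 0, sizeof(int), stream);
  count_negatives<<<1, n, 0, stream>>>(d_flags, n, d_count);

  int h_count = 0;
  cudaMemcpyAsync(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);  // single sync point for the whole pipeline

  cudaFree(d_flags);
  cudaFree(d_count);
  cudaStreamDestroy(stream);
  return h_count == n ? 0 : 1;
}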
@@ -505,15 +522,13 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
   cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int),
              cudaMemcpyDeviceToHost);
 
-  std::vector<std::shared_ptr<memory::Allocation>> local_storage;
   for (int i = 0; i < total_gpu; ++i) {
     int shard_len = h_right[i] - h_left[i] + 1;
     if (shard_len == 0) {
       continue;
     }
     create_storage(num, i, shard_len * sizeof(KeyType),
-                   shard_len * sizeof(ValType), local_storage);
+                   shard_len * sizeof(ValType));
   }
 
   walk_to_dest(num, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL);
@@ -533,6 +548,9 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
   }
 
   for (int i = 0; i < total_gpu; ++i) {
     cudaStreamSynchronize(resource_->remote_stream(i, num));
+    if (h_left[i] == -1) {
+      continue;
+    }
     tables_[i]->rwlock_->UNLock();
   }
@@ -546,6 +564,9 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
   fill_dvals<<<grid_size, block_size_, 0, stream>>>(d_shard_vals_ptr, d_vals,
                                                     d_idx_ptr, len);
   cudaStreamSynchronize(stream);
+  for (int i = 0; i < total_gpu; ++i) {
+    destroy_storage(num, i);
+  }
 }
 
 template <typename KeyType, typename ValType, typename GradType>
@@ -572,8 +593,8 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
   int* d_left_ptr = reinterpret_cast<int*>(d_left->ptr());
   int* d_right_ptr = reinterpret_cast<int*>(d_right->ptr());
 
-  cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int));
-  cudaMemset(d_right_ptr, -1, total_gpu * sizeof(int));
+  cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream);
+  cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream);
   //
   auto d_idx = memory::AllocShared(place, len * sizeof(int));
   int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
@@ -603,14 +624,13 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
   cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int),
              cudaMemcpyDeviceToHost);
 
-  std::vector<std::shared_ptr<memory::Allocation>> local_storage;
   for (int i = 0; i < total_gpu; ++i) {
     int shard_len = h_right[i] - h_left[i] + 1;
     if (h_left[i] == -1 || h_right[i] == -1) {
       continue;
     }
     create_storage(gpu_num, i, shard_len * sizeof(KeyType),
-                   shard_len * sizeof(GradType), local_storage);
+                   shard_len * sizeof(GradType));
   }
 
   walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr,
@@ -632,7 +652,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
   }
 
   for (int i = 0; i < total_gpu; ++i) {
     cudaStreamSynchronize(resource_->remote_stream(i, gpu_num));
-    tables_[i]->rwlock_->UNLock();
+    if (h_left[i] != -1) {
+      tables_[i]->rwlock_->UNLock();
+    }
+  }
+  for (int i = 0; i < total_gpu; ++i) {
+    destroy_storage(gpu_num, i);
   }
 }