Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
133d63fa
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
133d63fa
编写于
5月 18, 2022
作者:
T
Thunderbrook
提交者:
GitHub
5月 18, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix graph hang (#42768)
* fix device_free * fix hang
上级
fa8c755a
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
25 addition
and
11 deletion
+25
-11
paddle/fluid/distributed/ps/table/common_graph_table.cc
paddle/fluid/distributed/ps/table/common_graph_table.cc
+1
-1
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
+2
-2
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
.../fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
+15
-8
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
+7
-0
未找到文件。
paddle/fluid/distributed/ps/table/common_graph_table.cc
浏览文件 @
133d63fa
...
@@ -1441,7 +1441,7 @@ std::vector<std::vector<int64_t>> GraphTable::get_all_id(int type_id, int idx,
...
@@ -1441,7 +1441,7 @@ std::vector<std::vector<int64_t>> GraphTable::get_all_id(int type_id, int idx,
}
}
for
(
size_t
i
=
0
;
i
<
tasks
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
tasks
.
size
();
i
++
)
{
auto
ids
=
tasks
[
i
].
get
();
auto
ids
=
tasks
[
i
].
get
();
for
(
auto
&
id
:
ids
)
res
[
id
%
slice_num
].
push_back
(
id
);
for
(
auto
&
id
:
ids
)
res
[
(
uint64_t
)(
id
)
%
slice_num
].
push_back
(
id
);
}
}
return
res
;
return
res
;
}
}
...
...
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
浏览文件 @
133d63fa
...
@@ -23,10 +23,10 @@
...
@@ -23,10 +23,10 @@
#ifdef PADDLE_WITH_HETERPS
#ifdef PADDLE_WITH_HETERPS
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
class
GpuPsGraphTable
:
public
HeterComm
<
int64_t
,
int64_t
,
int
>
{
class
GpuPsGraphTable
:
public
HeterComm
<
u
int64_t
,
int64_t
,
int
>
{
public:
public:
GpuPsGraphTable
(
std
::
shared_ptr
<
HeterPsResource
>
resource
,
int
topo_aware
)
GpuPsGraphTable
(
std
::
shared_ptr
<
HeterPsResource
>
resource
,
int
topo_aware
)
:
HeterComm
<
int64_t
,
int64_t
,
int
>
(
1
,
resource
)
{
:
HeterComm
<
u
int64_t
,
int64_t
,
int
>
(
1
,
resource
)
{
load_factor_
=
0.25
;
load_factor_
=
0.25
;
rw_lock
.
reset
(
new
pthread_rwlock_t
());
rw_lock
.
reset
(
new
pthread_rwlock_t
());
gpu_num
=
resource_
->
total_device
();
gpu_num
=
resource_
->
total_device
();
...
...
paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
浏览文件 @
133d63fa
...
@@ -499,7 +499,7 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) {
...
@@ -499,7 +499,7 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) {
keys
.
push_back
(
g
.
node_list
[
j
].
node_id
);
keys
.
push_back
(
g
.
node_list
[
j
].
node_id
);
offset
.
push_back
(
j
);
offset
.
push_back
(
j
);
}
}
build_ps
(
i
,
keys
.
data
(),
offset
.
data
(),
keys
.
size
(),
1024
,
8
);
build_ps
(
i
,
(
uint64_t
*
)
keys
.
data
(),
offset
.
data
(),
keys
.
size
(),
1024
,
8
);
gpu_graph_list
[
i
].
node_size
=
g
.
node_size
;
gpu_graph_list
[
i
].
node_size
=
g
.
node_size
;
}
else
{
}
else
{
build_ps
(
i
,
NULL
,
NULL
,
0
,
1024
,
8
);
build_ps
(
i
,
NULL
,
NULL
,
0
,
1024
,
8
);
...
@@ -572,7 +572,8 @@ void GpuPsGraphTable::build_graph_from_cpu(
...
@@ -572,7 +572,8 @@ void GpuPsGraphTable::build_graph_from_cpu(
keys
.
push_back
(
cpu_graph_list
[
i
].
node_list
[
j
].
node_id
);
keys
.
push_back
(
cpu_graph_list
[
i
].
node_list
[
j
].
node_id
);
offset
.
push_back
(
j
);
offset
.
push_back
(
j
);
}
}
build_ps
(
i
,
keys
.
data
(),
offset
.
data
(),
keys
.
size
(),
1024
,
8
);
build_ps
(
i
,
(
uint64_t
*
)(
keys
.
data
()),
offset
.
data
(),
keys
.
size
(),
1024
,
8
);
gpu_graph_list
[
i
].
node_size
=
cpu_graph_list
[
i
].
node_size
;
gpu_graph_list
[
i
].
node_size
=
cpu_graph_list
[
i
].
node_size
;
}
else
{
}
else
{
build_ps
(
i
,
NULL
,
NULL
,
0
,
1024
,
8
);
build_ps
(
i
,
NULL
,
NULL
,
0
,
1024
,
8
);
...
@@ -665,7 +666,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
...
@@ -665,7 +666,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
int
*
d_shard_actual_sample_size_ptr
=
int
*
d_shard_actual_sample_size_ptr
=
reinterpret_cast
<
int
*>
(
d_shard_actual_sample_size
->
ptr
());
reinterpret_cast
<
int
*>
(
d_shard_actual_sample_size
->
ptr
());
split_input_to_shard
(
key
,
d_idx_ptr
,
len
,
d_left_ptr
,
d_right_ptr
,
gpu_id
);
split_input_to_shard
((
uint64_t
*
)(
key
),
d_idx_ptr
,
len
,
d_left_ptr
,
d_right_ptr
,
gpu_id
);
heter_comm_kernel_
->
fill_shard_key
(
d_shard_keys_ptr
,
key
,
d_idx_ptr
,
len
,
heter_comm_kernel_
->
fill_shard_key
(
d_shard_keys_ptr
,
key
,
d_idx_ptr
,
len
,
stream
);
stream
);
...
@@ -708,7 +710,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
...
@@ -708,7 +710,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
sizeof
(
int
)
*
(
shard_len
+
shard_len
%
2
));
sizeof
(
int
)
*
(
shard_len
+
shard_len
%
2
));
// auto& node = path_[gpu_id][i].nodes_[0];
// auto& node = path_[gpu_id][i].nodes_[0];
}
}
walk_to_dest
(
gpu_id
,
total_gpu
,
h_left
,
h_right
,
d_shard_keys_ptr
,
NULL
);
walk_to_dest
(
gpu_id
,
total_gpu
,
h_left
,
h_right
,
(
uint64_t
*
)(
d_shard_keys_ptr
),
NULL
);
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
if
(
h_left
[
i
]
==
-
1
)
{
if
(
h_left
[
i
]
==
-
1
)
{
...
@@ -720,7 +723,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
...
@@ -720,7 +723,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
node
.
in_stream
);
node
.
in_stream
);
cudaStreamSynchronize
(
node
.
in_stream
);
cudaStreamSynchronize
(
node
.
in_stream
);
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
tables_
[
i
]
->
get
(
reinterpret_cast
<
int64_t
*>
(
node
.
key_storage
),
tables_
[
i
]
->
get
(
reinterpret_cast
<
u
int64_t
*>
(
node
.
key_storage
),
reinterpret_cast
<
int64_t
*>
(
node
.
val_storage
),
reinterpret_cast
<
int64_t
*>
(
node
.
val_storage
),
h_right
[
i
]
-
h_left
[
i
]
+
1
,
h_right
[
i
]
-
h_left
[
i
]
+
1
,
resource_
->
remote_stream
(
i
,
gpu_id
));
resource_
->
remote_stream
(
i
,
gpu_id
));
...
@@ -805,7 +808,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
...
@@ -805,7 +808,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
auto
d_shard_actual_sample_size
=
memory
::
Alloc
(
place
,
len
*
sizeof
(
int
));
auto
d_shard_actual_sample_size
=
memory
::
Alloc
(
place
,
len
*
sizeof
(
int
));
int
*
d_shard_actual_sample_size_ptr
=
int
*
d_shard_actual_sample_size_ptr
=
reinterpret_cast
<
int
*>
(
d_shard_actual_sample_size
->
ptr
());
reinterpret_cast
<
int
*>
(
d_shard_actual_sample_size
->
ptr
());
split_input_to_shard
(
key
,
d_idx_ptr
,
len
,
d_left_ptr
,
d_right_ptr
,
gpu_id
);
split_input_to_shard
((
uint64_t
*
)(
key
),
d_idx_ptr
,
len
,
d_left_ptr
,
d_right_ptr
,
gpu_id
);
heter_comm_kernel_
->
fill_shard_key
(
d_shard_keys_ptr
,
key
,
d_idx_ptr
,
len
,
heter_comm_kernel_
->
fill_shard_key
(
d_shard_keys_ptr
,
key
,
d_idx_ptr
,
len
,
stream
);
stream
);
...
@@ -824,7 +830,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
...
@@ -824,7 +830,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
shard_len
*
(
1
+
sample_size
)
*
sizeof
(
int64_t
)
+
shard_len
*
(
1
+
sample_size
)
*
sizeof
(
int64_t
)
+
sizeof
(
int
)
*
(
shard_len
+
shard_len
%
2
));
sizeof
(
int
)
*
(
shard_len
+
shard_len
%
2
));
}
}
walk_to_dest
(
gpu_id
,
total_gpu
,
h_left
,
h_right
,
d_shard_keys_ptr
,
NULL
);
walk_to_dest
(
gpu_id
,
total_gpu
,
h_left
,
h_right
,
(
uint64_t
*
)(
d_shard_keys_ptr
),
NULL
);
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
if
(
h_left
[
i
]
==
-
1
)
{
if
(
h_left
[
i
]
==
-
1
)
{
...
@@ -837,7 +844,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
...
@@ -837,7 +844,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
cudaStreamSynchronize
(
node
.
in_stream
);
cudaStreamSynchronize
(
node
.
in_stream
);
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
// If not found, val is -1.
// If not found, val is -1.
tables_
[
i
]
->
get
(
reinterpret_cast
<
int64_t
*>
(
node
.
key_storage
),
tables_
[
i
]
->
get
(
reinterpret_cast
<
u
int64_t
*>
(
node
.
key_storage
),
reinterpret_cast
<
int64_t
*>
(
node
.
val_storage
),
reinterpret_cast
<
int64_t
*>
(
node
.
val_storage
),
h_right
[
i
]
-
h_left
[
i
]
+
1
,
h_right
[
i
]
-
h_left
[
i
]
+
1
,
resource_
->
remote_stream
(
i
,
gpu_id
));
resource_
->
remote_stream
(
i
,
gpu_id
));
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
浏览文件 @
133d63fa
...
@@ -320,6 +320,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
...
@@ -320,6 +320,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
>;
template
class
HashTable
<
unsigned
long
,
paddle
::
framework
::
FeatureValue
>;
template
class
HashTable
<
long
,
int
>;
template
class
HashTable
<
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
int
>;
template
class
HashTable
<
unsigned
long
,
unsigned
long
>;
template
class
HashTable
<
long
,
long
>;
template
class
HashTable
<
long
,
long
>;
template
class
HashTable
<
long
,
unsigned
long
>;
template
class
HashTable
<
long
,
unsigned
long
>;
template
class
HashTable
<
long
,
unsigned
int
>;
template
class
HashTable
<
long
,
unsigned
int
>;
...
@@ -333,6 +335,8 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
...
@@ -333,6 +335,8 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
int
*
d_vals
,
size_t
len
,
int
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
int
>
::
get
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
int
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
unsigned
long
>
::
get
<
cudaStream_t
>
(
template
void
HashTable
<
long
,
unsigned
long
>
::
get
<
cudaStream_t
>
(
const
long
*
d_keys
,
unsigned
long
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
const
long
*
d_keys
,
unsigned
long
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
long
>
::
get
<
cudaStream_t
>
(
const
long
*
d_keys
,
template
void
HashTable
<
long
,
long
>
::
get
<
cudaStream_t
>
(
const
long
*
d_keys
,
...
@@ -359,6 +363,9 @@ template void HashTable<long, long>::insert<cudaStream_t>(const long* d_keys,
...
@@ -359,6 +363,9 @@ template void HashTable<long, long>::insert<cudaStream_t>(const long* d_keys,
size_t
len
,
size_t
len
,
cudaStream_t
stream
);
cudaStream_t
stream
);
template
void
HashTable
<
unsigned
long
,
int
>
::
insert
<
cudaStream_t
>
(
const
unsigned
long
*
d_keys
,
const
int
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
template
void
HashTable
<
long
,
unsigned
long
>
::
insert
<
cudaStream_t
>
(
template
void
HashTable
<
long
,
unsigned
long
>
::
insert
<
cudaStream_t
>
(
const
long
*
d_keys
,
const
unsigned
long
*
d_vals
,
size_t
len
,
const
long
*
d_keys
,
const
unsigned
long
*
d_vals
,
size_t
len
,
cudaStream_t
stream
);
cudaStream_t
stream
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录