Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
539d7185
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
539d7185
编写于
7月 26, 2021
作者:
D
danleifeng
提交者:
GitHub
7月 26, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
【HETERPS】edit cuda remote_streams (#34276)
* psgpu:edit cuda remote_streams; test=develop
上级
ca174025
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
32 addition
and
19 deletion
+32
-19
paddle/fluid/framework/fleet/heter_ps/hashtable.h
paddle/fluid/framework/fleet/heter_ps/hashtable.h
+4
-1
paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+1
-0
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+13
-6
paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
+8
-6
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+3
-3
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+3
-3
未找到文件。
paddle/fluid/framework/fleet/heter_ps/hashtable.h
浏览文件 @
539d7185
...
@@ -23,8 +23,9 @@ limitations under the License. */
...
@@ -23,8 +23,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_PSCORE
#ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
#endif
#endif
#include "paddle/fluid/framework/rw_lock.h"
#include "thrust/pair.h"
#include "thrust/pair.h"
//#include "cudf/concurrent_unordered_map.cuh.h"
//
#include "cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#ifdef PADDLE_WITH_HETERPS
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/platform/type_defs.h"
#include "paddle/fluid/platform/type_defs.h"
...
@@ -63,6 +64,8 @@ class HashTable {
...
@@ -63,6 +64,8 @@ class HashTable {
int
size
()
{
return
container_
->
size
();
}
int
size
()
{
return
container_
->
size
();
}
std
::
unique_ptr
<
RWLock
>
rwlock_
{
nullptr
};
private:
private:
TableContainer
<
KeyType
,
ValType
>*
container_
;
TableContainer
<
KeyType
,
ValType
>*
container_
;
int
BLOCK_SIZE_
{
256
};
int
BLOCK_SIZE_
{
256
};
...
...
paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
浏览文件 @
539d7185
...
@@ -73,6 +73,7 @@ __global__ void update_kernel(Table* table,
...
@@ -73,6 +73,7 @@ __global__ void update_kernel(Table* table,
template
<
typename
KeyType
,
typename
ValType
>
template
<
typename
KeyType
,
typename
ValType
>
HashTable
<
KeyType
,
ValType
>::
HashTable
(
size_t
capacity
)
{
HashTable
<
KeyType
,
ValType
>::
HashTable
(
size_t
capacity
)
{
container_
=
new
TableContainer
<
KeyType
,
ValType
>
(
capacity
);
container_
=
new
TableContainer
<
KeyType
,
ValType
>
(
capacity
);
rwlock_
.
reset
(
new
RWLock
);
}
}
template
<
typename
KeyType
,
typename
ValType
>
template
<
typename
KeyType
,
typename
ValType
>
...
...
paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
浏览文件 @
539d7185
...
@@ -525,12 +525,15 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
...
@@ -525,12 +525,15 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
auto
&
node
=
path_
[
num
][
i
].
nodes_
.
back
();
auto
&
node
=
path_
[
num
][
i
].
nodes_
.
back
();
cudaStreamSynchronize
(
node
.
in_stream
);
cudaStreamSynchronize
(
node
.
in_stream
);
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
tables_
[
i
]
->
rwlock_
->
RDLock
();
tables_
[
i
]
->
get
(
reinterpret_cast
<
KeyType
*>
(
node
.
key_storage
),
tables_
[
i
]
->
get
(
reinterpret_cast
<
KeyType
*>
(
node
.
key_storage
),
reinterpret_cast
<
ValType
*>
(
node
.
val_storage
),
reinterpret_cast
<
ValType
*>
(
node
.
val_storage
),
h_right
[
i
]
-
h_left
[
i
]
+
1
,
resource_
->
remote_stream
(
i
));
h_right
[
i
]
-
h_left
[
i
]
+
1
,
resource_
->
remote_stream
(
i
,
num
));
}
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
cudaStreamSynchronize
(
resource_
->
remote_stream
(
i
));
cudaStreamSynchronize
(
resource_
->
remote_stream
(
i
,
num
));
tables_
[
i
]
->
rwlock_
->
UNLock
();
}
}
walk_to_src
(
num
,
total_gpu
,
h_left
,
h_right
,
d_shard_vals_ptr
);
walk_to_src
(
num
,
total_gpu
,
h_left
,
h_right
,
d_shard_vals_ptr
);
...
@@ -621,13 +624,15 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
...
@@ -621,13 +624,15 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int gpu_num,
cudaStreamSynchronize
(
node
.
in_stream
);
cudaStreamSynchronize
(
node
.
in_stream
);
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
tables_
[
i
]
->
rwlock_
->
WRLock
();
tables_
[
i
]
->
update
(
reinterpret_cast
<
KeyType
*>
(
node
.
key_storage
),
tables_
[
i
]
->
update
(
reinterpret_cast
<
KeyType
*>
(
node
.
key_storage
),
reinterpret_cast
<
GradType
*>
(
node
.
val_storage
),
reinterpret_cast
<
GradType
*>
(
node
.
val_storage
),
h_right
[
i
]
-
h_left
[
i
]
+
1
,
sgd
,
h_right
[
i
]
-
h_left
[
i
]
+
1
,
sgd
,
resource_
->
remote_stream
(
i
));
resource_
->
remote_stream
(
i
,
gpu_num
));
}
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
cudaStreamSynchronize
(
resource_
->
remote_stream
(
i
));
cudaStreamSynchronize
(
resource_
->
remote_stream
(
i
,
gpu_num
));
tables_
[
i
]
->
rwlock_
->
UNLock
();
}
}
}
}
...
@@ -641,9 +646,11 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table(
...
@@ -641,9 +646,11 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table(
int
dev_id
=
resource_
->
dev_id
(
gpu_num
);
int
dev_id
=
resource_
->
dev_id
(
gpu_num
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
tables_
[
gpu_num
]
->
rwlock_
->
WRLock
();
tables_
[
gpu_num
]
->
update
(
d_keys
,
d_grads
,
len
,
sgd
,
tables_
[
gpu_num
]
->
update
(
d_keys
,
d_grads
,
len
,
sgd
,
resource_
->
remote_stream
(
gpu_num
));
resource_
->
remote_stream
(
gpu_num
,
gpu_num
));
cudaStreamSynchronize
(
resource_
->
remote_stream
(
gpu_num
));
tables_
[
gpu_num
]
->
rwlock_
->
UNLock
();
cudaStreamSynchronize
(
resource_
->
remote_stream
(
gpu_num
,
gpu_num
));
}
}
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
...
...
paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
浏览文件 @
539d7185
...
@@ -27,16 +27,16 @@ GPUResource::GPUResource(std::vector<int>& dev_ids, int index) {
...
@@ -27,16 +27,16 @@ GPUResource::GPUResource(std::vector<int>& dev_ids, int index) {
platform
::
CUDADeviceGuard
guard
(
dev_id_
);
platform
::
CUDADeviceGuard
guard
(
dev_id_
);
local_streams_
.
resize
(
dev_ids_
.
size
());
local_streams_
.
resize
(
dev_ids_
.
size
());
comm_streams_
.
resize
(
dev_ids_
.
size
());
comm_streams_
.
resize
(
dev_ids_
.
size
());
remote_streams_
.
resize
(
dev_ids_
.
size
());
for
(
size_t
i
=
0
;
i
<
dev_ids_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
dev_ids_
.
size
();
++
i
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamCreateWithFlags
(
&
local_streams_
[
i
],
cudaStreamNonBlocking
));
cudaStreamCreateWithFlags
(
&
local_streams_
[
i
],
cudaStreamNonBlocking
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamCreateWithFlags
(
&
comm_streams_
[
i
],
cudaStreamNonBlocking
));
cudaStreamCreateWithFlags
(
&
comm_streams_
[
i
],
cudaStreamNonBlocking
));
}
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamCreateWithFlags
(
&
remote_stream_
,
cudaStreamNonBlocking
));
cudaStreamCreateWithFlags
(
&
remote_streams_
[
i
],
cudaStreamNonBlocking
));
}
}
}
GPUResource
::~
GPUResource
()
{
GPUResource
::~
GPUResource
()
{
...
@@ -47,7 +47,9 @@ GPUResource::~GPUResource() {
...
@@ -47,7 +47,9 @@ GPUResource::~GPUResource() {
for
(
size_t
i
=
0
;
i
<
comm_streams_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
comm_streams_
.
size
();
++
i
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamDestroy
(
comm_streams_
[
i
]));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamDestroy
(
comm_streams_
[
i
]));
}
}
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamDestroy
(
remote_stream_
));
for
(
size_t
i
=
0
;
i
<
remote_streams_
.
size
();
++
i
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamDestroy
(
remote_streams_
[
i
]));
}
}
}
void
HeterPsResource
::
enable_p2p
()
{
void
HeterPsResource
::
enable_p2p
()
{
...
@@ -90,8 +92,8 @@ cudaStream_t HeterPsResource::local_stream(int gpu_num, int stream_num) {
...
@@ -90,8 +92,8 @@ cudaStream_t HeterPsResource::local_stream(int gpu_num, int stream_num) {
return
resources_
[
gpu_num
]
->
local_stream
(
stream_num
);
return
resources_
[
gpu_num
]
->
local_stream
(
stream_num
);
}
}
cudaStream_t
HeterPsResource
::
remote_stream
(
int
gpu_num
)
{
cudaStream_t
HeterPsResource
::
remote_stream
(
int
gpu_num
,
int
stream_num
)
{
return
resources_
[
gpu_num
]
->
remote_stream
();
return
resources_
[
gpu_num
]
->
remote_stream
(
stream_num
);
}
}
int
HeterPsResource
::
dev_id
(
int
num
)
{
return
dev_ids_
[
num
];
}
int
HeterPsResource
::
dev_id
(
int
num
)
{
return
dev_ids_
[
num
];
}
...
...
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
浏览文件 @
539d7185
...
@@ -35,13 +35,13 @@ class GPUResource {
...
@@ -35,13 +35,13 @@ class GPUResource {
int
dev_id
()
const
{
return
dev_id_
;
}
int
dev_id
()
const
{
return
dev_id_
;
}
int
index
()
const
{
return
index_
;
}
int
index
()
const
{
return
index_
;
}
gpuStream_t
local_stream
(
int
num
)
{
return
local_streams_
[
num
];
}
gpuStream_t
local_stream
(
int
num
)
{
return
local_streams_
[
num
];
}
gpuStream_t
remote_stream
(
)
{
return
remote_stream_
;
}
gpuStream_t
remote_stream
(
int
num
)
{
return
remote_streams_
[
num
]
;
}
gpuStream_t
comm_stream
(
int
num
)
{
return
comm_streams_
[
num
];
}
gpuStream_t
comm_stream
(
int
num
)
{
return
comm_streams_
[
num
];
}
int
dev_id_
;
int
dev_id_
;
int
index_
;
int
index_
;
std
::
vector
<
int
>
dev_ids_
;
std
::
vector
<
int
>
dev_ids_
;
gpuStream_t
remote_stream
_
;
std
::
vector
<
gpuStream_t
>
remote_streams
_
;
std
::
vector
<
gpuStream_t
>
local_streams_
;
std
::
vector
<
gpuStream_t
>
local_streams_
;
std
::
vector
<
gpuStream_t
>
comm_streams_
;
std
::
vector
<
gpuStream_t
>
comm_streams_
;
};
};
...
@@ -57,7 +57,7 @@ class HeterPsResource {
...
@@ -57,7 +57,7 @@ class HeterPsResource {
int
get_index_by_devid
(
int
devid
);
int
get_index_by_devid
(
int
devid
);
int
dev_id
(
int
num
);
int
dev_id
(
int
num
);
gpuStream_t
local_stream
(
int
gpu_num
,
int
stream_num
);
gpuStream_t
local_stream
(
int
gpu_num
,
int
stream_num
);
gpuStream_t
remote_stream
(
int
gpu_num
);
gpuStream_t
remote_stream
(
int
gpu_num
,
int
stream_num
);
gpuStream_t
comm_stream
(
int
gpu_num
,
int
stream_num
);
gpuStream_t
comm_stream
(
int
gpu_num
,
int
stream_num
);
std
::
vector
<
std
::
shared_ptr
<
GPUResource
>>
resources_
;
std
::
vector
<
std
::
shared_ptr
<
GPUResource
>>
resources_
;
...
...
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
浏览文件 @
539d7185
...
@@ -121,7 +121,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
...
@@ -121,7 +121,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
cudaMemcpy
(
gpu_values
,
values
.
data
(),
values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpy
(
gpu_values
,
values
.
data
(),
values
.
size
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
);
cudaMemcpyHostToDevice
);
PullCopy
<<<
(
total_length
+
512
-
1
)
/
512
,
512
,
0
,
stream
>>>
(
PullCopy
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
gpu_values
,
total_values_gpu
,
gpu_len
,
hidden_size
,
slot_num
,
gpu_values
,
total_values_gpu
,
gpu_len
,
hidden_size
,
slot_num
,
total_length
,
gpu_keys
);
total_length
,
gpu_keys
);
cudaStreamSynchronize
(
stream
);
cudaStreamSynchronize
(
stream
);
...
@@ -135,7 +135,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
...
@@ -135,7 +135,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
)))
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
)))
->
stream
();
->
stream
();
CopyKeysKernel
<<<
(
total_len
+
512
-
1
)
/
512
,
512
,
0
,
stream
>>>
(
CopyKeysKernel
<<<
(
total_len
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
origin_keys
,
total_keys
,
gpu_len
,
slot_num
,
total_len
);
origin_keys
,
total_keys
,
gpu_len
,
slot_num
,
total_len
);
cudaStreamSynchronize
(
stream
);
cudaStreamSynchronize
(
stream
);
}
}
...
@@ -173,7 +173,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
...
@@ -173,7 +173,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
cudaMemcpy
(
d_slot_vector
,
slot_vector_
.
data
(),
cudaMemcpy
(
d_slot_vector
,
slot_vector_
.
data
(),
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
slot_lengths_lod
.
size
()
*
sizeof
(
int
),
cudaMemcpyHostToDevice
);
PushCopy
<<<
(
total_length
+
512
-
1
)
/
512
,
512
,
0
,
stream
>>>
(
PushCopy
<<<
(
total_length
+
1024
-
1
)
/
1024
,
1024
,
0
,
stream
>>>
(
total_grad_values_gpu
,
gpu_values
,
gpu_len
,
hidden_size
,
total_grad_values_gpu
,
gpu_values
,
gpu_len
,
hidden_size
,
slot_lengths
.
size
(),
total_length
,
batch_size
,
d_slot_vector
);
slot_lengths
.
size
(),
total_length
,
batch_size
,
d_slot_vector
);
cudaStreamSynchronize
(
stream
);
cudaStreamSynchronize
(
stream
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录