Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
df14dbf0
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
df14dbf0
编写于
11月 23, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
11月 23, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[fleet_executor] Update with collective (#37462)
上级
38f1ef50
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
60 addition
and
11 deletion
+60
-11
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+3
-2
paddle/fluid/distributed/fleet_executor/message_bus.cc
paddle/fluid/distributed/fleet_executor/message_bus.cc
+34
-8
paddle/fluid/platform/gen_comm_id_helper.cc
paddle/fluid/platform/gen_comm_id_helper.cc
+10
-0
paddle/fluid/platform/gen_comm_id_helper.h
paddle/fluid/platform/gen_comm_id_helper.h
+13
-1
未找到文件。
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
浏览文件 @
df14dbf0
...
...
@@ -12,7 +12,8 @@ endif()
cc_library
(
fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc
interceptor.cc compute_interceptor.cc interceptor_message_service.cc message_bus.cc
DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto
${
BRPC_DEPS
}
)
DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper
${
BRPC_DEPS
}
)
if
(
WITH_DISTRIBUTE
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
...
...
paddle/fluid/distributed/fleet_executor/message_bus.cc
浏览文件 @
df14dbf0
...
...
@@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <memory>
#include <thread>
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace
paddle
{
namespace
distributed
{
...
...
@@ -32,6 +35,21 @@ void MessageBus::Init(
rank_to_addr_
=
rank_to_addr
;
addr_
=
addr
;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
// NOTE: To make brpc compatible with collective,
// we need to release the handler holding the ip address.
if
(
addr_
!=
""
)
{
VLOG
(
3
)
<<
"Message bus is releasing the fd held by gen_comm_id."
;
paddle
::
platform
::
SocketServer
&
socket_server
=
paddle
::
platform
::
SocketServer
::
GetInstance
(
addr_
);
int
server_fd
=
socket_server
.
socket
();
if
(
server_fd
!=
-
1
)
{
socket_server
.
Release
();
}
}
#endif
ListenPort
();
std
::
call_once
(
once_flag_
,
[]()
{
...
...
@@ -87,7 +105,7 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) {
void
MessageBus
::
ListenPort
()
{
if
(
addr_
==
""
)
{
VLOG
(
3
)
<<
"No need listen to port since training on single card."
;
LOG
(
INFO
)
<<
"No need listen to port since training on single card."
;
return
;
}
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
...
...
@@ -103,12 +121,20 @@ void MessageBus::ListenPort() {
const
char
*
ip_for_brpc
=
addr_
.
c_str
();
brpc
::
ServerOptions
options
;
options
.
idle_timeout_sec
=
-
1
;
PADDLE_ENFORCE_EQ
(
server_
.
Start
(
ip_for_brpc
,
&
options
),
0
,
platform
::
errors
::
Unavailable
(
"Message bus: start brpc service error."
));
VLOG
(
3
)
<<
"Message bus's listen port thread starts successful."
;
int
retry_times
=
0
;
int
interval
=
1000
;
while
(
server_
.
Start
(
ip_for_brpc
,
&
options
)
!=
0
)
{
++
retry_times
;
LOG
(
INFO
)
<<
"Message bus is retring for starting brpc for "
<<
retry_times
<<
" times. And will retry after "
<<
interval
/
1000
<<
" seconds."
;
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
interval
));
interval
+=
2000
;
}
LOG
(
INFO
)
<<
"Message bus's listen port thread starts successful."
;
#else
VLOG
(
3
)
<<
"Fleet executor's ListenPort() is a fake function when Paddle is "
LOG
(
WARNING
)
<<
"Fleet executor's ListenPort() is a fake function when Paddle is "
"compiled with npu or Paddle isn't compiled "
"with distributed for now."
;
#endif
...
...
paddle/fluid/platform/gen_comm_id_helper.cc
浏览文件 @
df14dbf0
...
...
@@ -153,6 +153,16 @@ int CreateListenSocket(const std::string& ep) {
// not enter the TIME-WAIT state. But this is obviously not as convenient
// as the reuse method.
int
opt
=
1
;
// NOTE: Enabling SO_LINGER with a zero timeout forcefully skips the TIME-WAIT state.
linger
ling
;
ling
.
l_onoff
=
1
;
ling
.
l_linger
=
0
;
CHECK_SYS_CALL
(
setsockopt
(
server_fd
,
SOL_SOCKET
,
SO_LINGER
,
&
ling
,
sizeof
(
ling
)),
"setsockopt set linger"
);
#if defined(SO_REUSEPORT)
// since Linux kernel 3.9
CHECK_SYS_CALL
(
setsockopt
(
server_fd
,
SOL_SOCKET
,
SO_REUSEADDR
|
SO_REUSEPORT
,
...
...
paddle/fluid/platform/gen_comm_id_helper.h
浏览文件 @
df14dbf0
...
...
@@ -22,6 +22,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "glog/logging.h"
namespace
paddle
{
namespace
platform
{
...
...
@@ -46,10 +48,20 @@ class SocketServer {
public:
SocketServer
()
=
default
;
// Destructor variant that passes server_fd_ to CloseSocket unconditionally.
// NOTE(review): a second destructor definition that first checks
// server_fd_ != -1 follows directly below in this view; presumably this one is
// the removed side of the diff -- confirm before relying on it.
~
SocketServer
()
{
CloseSocket
(
server_fd_
);
}
// Destructor: hands server_fd_ to CloseSocket, but only when it still holds a
// descriptor (i.e. it is not the -1 sentinel).
~SocketServer() {
  const bool has_open_fd = (server_fd_ != -1);
  if (!has_open_fd) {
    return;
  }
  CloseSocket(server_fd_);
}
// Returns the descriptor currently stored in server_fd_ (-1 when none is held).
int socket() const { return server_fd_; }
// Closes the server socket on behalf of an external caller and marks it as
// released by resetting server_fd_ to -1.
//
// Calling this when no descriptor is held (server_fd_ == -1) is a no-op, so an
// invalid descriptor is never handed to CloseSocket and repeated calls are
// safe. This mirrors the -1 check performed by the destructor.
void Release() {
  if (server_fd_ == -1) {
    // Nothing to release; avoid closing an invalid descriptor.
    return;
  }
  VLOG(3) << "Server will be closed by external call.";
  CloseSocket(server_fd_);
  // Reset to the sentinel so later checks see that the socket is gone.
  server_fd_ = -1;
}
static
SocketServer
&
GetInstance
(
const
std
::
string
&
end_point
);
private:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录